cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

tcp.h (75181B)


      1/* SPDX-License-Identifier: GPL-2.0-or-later */
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		Definitions for the TCP module.
      8 *
      9 * Version:	@(#)tcp.h	1.0.5	05/23/93
     10 *
     11 * Authors:	Ross Biro
     12 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     13 */
     14#ifndef _TCP_H
     15#define _TCP_H
     16
     17#define FASTRETRANS_DEBUG 1
     18
     19#include <linux/list.h>
     20#include <linux/tcp.h>
     21#include <linux/bug.h>
     22#include <linux/slab.h>
     23#include <linux/cache.h>
     24#include <linux/percpu.h>
     25#include <linux/skbuff.h>
     26#include <linux/kref.h>
     27#include <linux/ktime.h>
     28#include <linux/indirect_call_wrapper.h>
     29
     30#include <net/inet_connection_sock.h>
     31#include <net/inet_timewait_sock.h>
     32#include <net/inet_hashtables.h>
     33#include <net/checksum.h>
     34#include <net/request_sock.h>
     35#include <net/sock_reuseport.h>
     36#include <net/sock.h>
     37#include <net/snmp.h>
     38#include <net/ip.h>
     39#include <net/tcp_states.h>
     40#include <net/inet_ecn.h>
     41#include <net/dst.h>
     42#include <net/mptcp.h>
     43
     44#include <linux/seq_file.h>
     45#include <linux/memcontrol.h>
     46#include <linux/bpf-cgroup.h>
     47#include <linux/siphash.h>
     48
     49extern struct inet_hashinfo tcp_hashinfo;
     50
     51DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
     52int tcp_orphan_count_sum(void);
     53
     54void tcp_time_wait(struct sock *sk, int state, int timeo);
     55
     56#define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
     57#define MAX_TCP_OPTION_SPACE 40
     58#define TCP_MIN_SND_MSS		48
     59#define TCP_MIN_GSO_SIZE	(TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
     60
     61/*
     62 * Never offer a window over 32767 without using window scaling. Some
     63 * poor stacks do signed 16bit maths!
     64 */
     65#define MAX_TCP_WINDOW		32767U
     66
     67/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
     68#define TCP_MIN_MSS		88U
     69
     70/* The initial MTU to use for probing */
     71#define TCP_BASE_MSS		1024
     72
     73/* probing interval, default to 10 minutes as per RFC4821 */
     74#define TCP_PROBE_INTERVAL	600
     75
     76/* Specify interval when tcp mtu probing will stop */
     77#define TCP_PROBE_THRESHOLD	8
     78
     79/* After receiving this amount of duplicate ACKs fast retransmit starts. */
     80#define TCP_FASTRETRANS_THRESH 3
     81
     82/* Maximal number of ACKs sent quickly to accelerate slow-start. */
     83#define TCP_MAX_QUICKACKS	16U
     84
     85/* Maximal number of window scale according to RFC1323 */
     86#define TCP_MAX_WSCALE		14U
     87
     88/* urg_data states */
     89#define TCP_URG_VALID	0x0100
     90#define TCP_URG_NOTYET	0x0200
     91#define TCP_URG_READ	0x0400
     92
     93#define TCP_RETR1	3	/*
     94				 * This is how many retries it does before it
     95				 * tries to figure out if the gateway is
     96				 * down. Minimal RFC value is 3; it corresponds
     97				 * to ~3sec-8min depending on RTO.
     98				 */
     99
    100#define TCP_RETR2	15	/*
    101				 * This should take at least
    102				 * 90 minutes to time out.
    103				 * RFC1122 says that the limit is 100 sec.
    104				 * 15 is ~13-30min depending on RTO.
    105				 */
    106
    107#define TCP_SYN_RETRIES	 6	/* This is how many retries are done
    108				 * when active opening a connection.
    109				 * RFC1122 says the minimum retry MUST
    110				 * be at least 180secs.  Nevertheless
    111				 * this value is corresponding to
    112				 * 63secs of retransmission with the
    113				 * current initial RTO.
    114				 */
    115
    116#define TCP_SYNACK_RETRIES 5	/* This is how many retries are done
    117				 * when passive opening a connection.
    118				 * This is corresponding to 31secs of
    119				 * retransmission with the current
    120				 * initial RTO.
    121				 */
    122
    123#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
    124				  * state, about 60 seconds	*/
    125#define TCP_FIN_TIMEOUT	TCP_TIMEWAIT_LEN
    126                                 /* BSD style FIN_WAIT2 deadlock breaker.
    127				  * It used to be 3min, new value is 60sec,
    128				  * to combine FIN-WAIT-2 timeout with
    129				  * TIME-WAIT timer.
    130				  */
    131#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
    132
    133#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
    134#if HZ >= 100
    135#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
    136#define TCP_ATO_MIN	((unsigned)(HZ/25))
    137#else
    138#define TCP_DELACK_MIN	4U
    139#define TCP_ATO_MIN	4U
    140#endif
    141#define TCP_RTO_MAX	((unsigned)(120*HZ))
    142#define TCP_RTO_MIN	((unsigned)(HZ/5))
    143#define TCP_TIMEOUT_MIN	(2U) /* Min timeout for TCP timers in jiffies */
    144#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/
    145#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
    146						 * used as a fallback RTO for the
    147						 * initial data transmission if no
    148						 * valid RTT sample has been acquired,
    149						 * most likely due to retrans in 3WHS.
    150						 */
    151
    152#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
    153					                 * for local resources.
    154					                 */
    155#define TCP_KEEPALIVE_TIME	(120*60*HZ)	/* two hours */
    156#define TCP_KEEPALIVE_PROBES	9		/* Max of 9 keepalive probes	*/
    157#define TCP_KEEPALIVE_INTVL	(75*HZ)
    158
    159#define MAX_TCP_KEEPIDLE	32767
    160#define MAX_TCP_KEEPINTVL	32767
    161#define MAX_TCP_KEEPCNT		127
    162#define MAX_TCP_SYNCNT		127
    163
    164#define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
    165
    166#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
    167#define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
    168					 * after this time. It should be equal
    169					 * (or greater than) TCP_TIMEWAIT_LEN
    170					 * to provide reliability equal to one
    171					 * provided by timewait state.
    172					 */
    173#define TCP_PAWS_WINDOW	1		/* Replay window for per-host
    174					 * timestamps. It must be less than
    175					 * minimal timewait lifetime.
    176					 */
    177/*
    178 *	TCP option
    179 */
    180
    181#define TCPOPT_NOP		1	/* Padding */
    182#define TCPOPT_EOL		0	/* End of options */
    183#define TCPOPT_MSS		2	/* Segment size negotiating */
    184#define TCPOPT_WINDOW		3	/* Window scaling */
    185#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
    186#define TCPOPT_SACK             5       /* SACK Block */
    187#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
    188#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
    189#define TCPOPT_MPTCP		30	/* Multipath TCP (RFC6824) */
    190#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
    191#define TCPOPT_EXP		254	/* Experimental */
    192/* Magic number to be after the option value for sharing TCP
    193 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
    194 */
    195#define TCPOPT_FASTOPEN_MAGIC	0xF989
    196#define TCPOPT_SMC_MAGIC	0xE2D4C3D9
    197
    198/*
    199 *     TCP option lengths
    200 */
    201
    202#define TCPOLEN_MSS            4
    203#define TCPOLEN_WINDOW         3
    204#define TCPOLEN_SACK_PERM      2
    205#define TCPOLEN_TIMESTAMP      10
    206#define TCPOLEN_MD5SIG         18
    207#define TCPOLEN_FASTOPEN_BASE  2
    208#define TCPOLEN_EXP_FASTOPEN_BASE  4
    209#define TCPOLEN_EXP_SMC_BASE   6
    210
    211/* But this is what stacks really send out. */
    212#define TCPOLEN_TSTAMP_ALIGNED		12
    213#define TCPOLEN_WSCALE_ALIGNED		4
    214#define TCPOLEN_SACKPERM_ALIGNED	4
    215#define TCPOLEN_SACK_BASE		2
    216#define TCPOLEN_SACK_BASE_ALIGNED	4
    217#define TCPOLEN_SACK_PERBLOCK		8
    218#define TCPOLEN_MD5SIG_ALIGNED		20
    219#define TCPOLEN_MSS_ALIGNED		4
    220#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
    221
    222/* Flags in tp->nonagle */
    223#define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
    224#define TCP_NAGLE_CORK		2	/* Socket is corked	    */
    225#define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */
    226
    227/* TCP thin-stream limits */
    228#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
    229
    230/* TCP initial congestion window as per rfc6928 */
    231#define TCP_INIT_CWND		10
    232
    233/* Bit Flags for sysctl_tcp_fastopen */
    234#define	TFO_CLIENT_ENABLE	1
    235#define	TFO_SERVER_ENABLE	2
    236#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */
    237
    238/* Accept SYN data w/o any cookie option */
    239#define	TFO_SERVER_COOKIE_NOT_REQD	0x200
    240
    241/* Force enable TFO on all listeners, i.e., not requiring the
    242 * TCP_FASTOPEN socket option.
    243 */
    244#define	TFO_SERVER_WO_SOCKOPT1	0x400
    245
    246
    247/* sysctl variables for tcp */
    248extern int sysctl_tcp_max_orphans;
    249extern long sysctl_tcp_mem[3];
    250
    251#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
    252#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
    253#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
    254
    255extern atomic_long_t tcp_memory_allocated;
    256extern struct percpu_counter tcp_sockets_allocated;
    257extern unsigned long tcp_memory_pressure;
    258
    259/* optimized version of sk_under_memory_pressure() for TCP sockets */
    260static inline bool tcp_under_memory_pressure(const struct sock *sk)
    261{
    262	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
    263	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
    264		return true;
    265
    266	return READ_ONCE(tcp_memory_pressure);
    267}
    268/*
    269 * The next routines deal with comparing 32 bit unsigned ints
    270 * and worry about wraparound (automatic with unsigned arithmetic).
    271 */
    272
    273static inline bool before(__u32 seq1, __u32 seq2)
    274{
    275        return (__s32)(seq1-seq2) < 0;
    276}
    277#define after(seq2, seq1) 	before(seq1, seq2)
    278
    279/* is s2<=s1<=s3 ? */
    280static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
    281{
    282	return seq3 - seq2 >= seq1 - seq2;
    283}
    284
    285static inline bool tcp_out_of_memory(struct sock *sk)
    286{
    287	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
    288	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
    289		return true;
    290	return false;
    291}
    292
    293static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
    294{
    295	sk_wmem_queued_add(sk, -skb->truesize);
    296	if (!skb_zcopy_pure(skb))
    297		sk_mem_uncharge(sk, skb->truesize);
    298	else
    299		sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb)));
    300	__kfree_skb(skb);
    301}
    302
    303void sk_forced_mem_schedule(struct sock *sk, int size);
    304
    305bool tcp_check_oom(struct sock *sk, int shift);
    306
    307
    308extern struct proto tcp_prot;
    309
    310#define TCP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.tcp_statistics, field)
    311#define __TCP_INC_STATS(net, field)	__SNMP_INC_STATS((net)->mib.tcp_statistics, field)
    312#define TCP_DEC_STATS(net, field)	SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
    313#define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
    314
    315void tcp_tasklet_init(void);
    316
    317int tcp_v4_err(struct sk_buff *skb, u32);
    318
    319void tcp_shutdown(struct sock *sk, int how);
    320
    321int tcp_v4_early_demux(struct sk_buff *skb);
    322int tcp_v4_rcv(struct sk_buff *skb);
    323
    324void tcp_remove_empty_skb(struct sock *sk);
    325int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
    326int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
    327int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
    328int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
    329		 int flags);
    330int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
    331			size_t size, int flags);
    332ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
    333		 size_t size, int flags);
    334int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
    335void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
    336	      int size_goal);
    337void tcp_release_cb(struct sock *sk);
    338void tcp_wfree(struct sk_buff *skb);
    339void tcp_write_timer_handler(struct sock *sk);
    340void tcp_delack_timer_handler(struct sock *sk);
    341int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
    342int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
    343void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
    344void tcp_rcv_space_adjust(struct sock *sk);
    345int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
    346void tcp_twsk_destructor(struct sock *sk);
    347ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
    348			struct pipe_inode_info *pipe, size_t len,
    349			unsigned int flags);
    350struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
    351				     bool force_schedule);
    352
    353void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
    354static inline void tcp_dec_quickack_mode(struct sock *sk,
    355					 const unsigned int pkts)
    356{
    357	struct inet_connection_sock *icsk = inet_csk(sk);
    358
    359	if (icsk->icsk_ack.quick) {
    360		if (pkts >= icsk->icsk_ack.quick) {
    361			icsk->icsk_ack.quick = 0;
    362			/* Leaving quickack mode we deflate ATO. */
    363			icsk->icsk_ack.ato   = TCP_ATO_MIN;
    364		} else
    365			icsk->icsk_ack.quick -= pkts;
    366	}
    367}
    368
    369#define	TCP_ECN_OK		1
    370#define	TCP_ECN_QUEUE_CWR	2
    371#define	TCP_ECN_DEMAND_CWR	4
    372#define	TCP_ECN_SEEN		8
    373
    374enum tcp_tw_status {
    375	TCP_TW_SUCCESS = 0,
    376	TCP_TW_RST = 1,
    377	TCP_TW_ACK = 2,
    378	TCP_TW_SYN = 3
    379};
    380
    381
    382enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
    383					      struct sk_buff *skb,
    384					      const struct tcphdr *th);
    385struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
    386			   struct request_sock *req, bool fastopen,
    387			   bool *lost_race);
    388int tcp_child_process(struct sock *parent, struct sock *child,
    389		      struct sk_buff *skb);
    390void tcp_enter_loss(struct sock *sk);
    391void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
    392void tcp_clear_retrans(struct tcp_sock *tp);
    393void tcp_update_metrics(struct sock *sk);
    394void tcp_init_metrics(struct sock *sk);
    395void tcp_metrics_init(void);
    396bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
    397void __tcp_close(struct sock *sk, long timeout);
    398void tcp_close(struct sock *sk, long timeout);
    399void tcp_init_sock(struct sock *sk);
    400void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
    401__poll_t tcp_poll(struct file *file, struct socket *sock,
    402		      struct poll_table_struct *wait);
    403int tcp_getsockopt(struct sock *sk, int level, int optname,
    404		   char __user *optval, int __user *optlen);
    405bool tcp_bpf_bypass_getsockopt(int level, int optname);
    406int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
    407		   unsigned int optlen);
    408void tcp_set_keepalive(struct sock *sk, int val);
    409void tcp_syn_ack_timeout(const struct request_sock *req);
    410int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
    411		int flags, int *addr_len);
    412int tcp_set_rcvlowat(struct sock *sk, int val);
    413int tcp_set_window_clamp(struct sock *sk, int val);
    414void tcp_update_recv_tstamps(struct sk_buff *skb,
    415			     struct scm_timestamping_internal *tss);
    416void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
    417			struct scm_timestamping_internal *tss);
    418void tcp_data_ready(struct sock *sk);
    419#ifdef CONFIG_MMU
    420int tcp_mmap(struct file *file, struct socket *sock,
    421	     struct vm_area_struct *vma);
    422#endif
    423void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
    424		       struct tcp_options_received *opt_rx,
    425		       int estab, struct tcp_fastopen_cookie *foc);
    426const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
    427
    428/*
    429 *	BPF SKB-less helpers
    430 */
    431u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
    432			 struct tcphdr *th, u32 *cookie);
    433u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
    434			 struct tcphdr *th, u32 *cookie);
    435u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
    436			  const struct tcp_request_sock_ops *af_ops,
    437			  struct sock *sk, struct tcphdr *th);
    438/*
    439 *	TCP v4 functions exported for the inet6 API
    440 */
    441
    442void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
    443void tcp_v4_mtu_reduced(struct sock *sk);
    444void tcp_req_err(struct sock *sk, u32 seq, bool abort);
    445void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
    446int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
    447struct sock *tcp_create_openreq_child(const struct sock *sk,
    448				      struct request_sock *req,
    449				      struct sk_buff *skb);
    450void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
    451struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
    452				  struct request_sock *req,
    453				  struct dst_entry *dst,
    454				  struct request_sock *req_unhash,
    455				  bool *own_req);
    456int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
    457int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
    458int tcp_connect(struct sock *sk);
    459enum tcp_synack_type {
    460	TCP_SYNACK_NORMAL,
    461	TCP_SYNACK_FASTOPEN,
    462	TCP_SYNACK_COOKIE,
    463};
    464struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
    465				struct request_sock *req,
    466				struct tcp_fastopen_cookie *foc,
    467				enum tcp_synack_type synack_type,
    468				struct sk_buff *syn_skb);
    469int tcp_disconnect(struct sock *sk, int flags);
    470
    471void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
    472int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
    473void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
    474
    475/* From syncookies.c */
    476struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
    477				 struct request_sock *req,
    478				 struct dst_entry *dst, u32 tsoff);
    479int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
    480		      u32 cookie);
    481struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
    482struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
    483					    const struct tcp_request_sock_ops *af_ops,
    484					    struct sock *sk, struct sk_buff *skb);
    485#ifdef CONFIG_SYN_COOKIES
    486
    487/* Syncookies use a monotonic timer which increments every 60 seconds.
    488 * This counter is used both as a hash input and partially encoded into
    489 * the cookie value.  A cookie is only validated further if the delta
    490 * between the current counter value and the encoded one is less than this,
    491 * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
    492 * the counter advances immediately after a cookie is generated).
    493 */
    494#define MAX_SYNCOOKIE_AGE	2
    495#define TCP_SYNCOOKIE_PERIOD	(60 * HZ)
    496#define TCP_SYNCOOKIE_VALID	(MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
    497
    498/* syncookies: remember time of last synqueue overflow
    499 * But do not dirty this field too often (once per second is enough)
    500 * It is racy as we do not hold a lock, but race is very minor.
    501 */
    502static inline void tcp_synq_overflow(const struct sock *sk)
    503{
    504	unsigned int last_overflow;
    505	unsigned int now = jiffies;
    506
    507	if (sk->sk_reuseport) {
    508		struct sock_reuseport *reuse;
    509
    510		reuse = rcu_dereference(sk->sk_reuseport_cb);
    511		if (likely(reuse)) {
    512			last_overflow = READ_ONCE(reuse->synq_overflow_ts);
    513			if (!time_between32(now, last_overflow,
    514					    last_overflow + HZ))
    515				WRITE_ONCE(reuse->synq_overflow_ts, now);
    516			return;
    517		}
    518	}
    519
    520	last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
    521	if (!time_between32(now, last_overflow, last_overflow + HZ))
    522		WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
    523}
    524
    525/* syncookies: no recent synqueue overflow on this listening socket? */
    526static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
    527{
    528	unsigned int last_overflow;
    529	unsigned int now = jiffies;
    530
    531	if (sk->sk_reuseport) {
    532		struct sock_reuseport *reuse;
    533
    534		reuse = rcu_dereference(sk->sk_reuseport_cb);
    535		if (likely(reuse)) {
    536			last_overflow = READ_ONCE(reuse->synq_overflow_ts);
    537			return !time_between32(now, last_overflow - HZ,
    538					       last_overflow +
    539					       TCP_SYNCOOKIE_VALID);
    540		}
    541	}
    542
    543	last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
    544
    545	/* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
    546	 * then we're under synflood. However, we have to use
    547	 * 'last_overflow - HZ' as lower bound. That's because a concurrent
    548	 * tcp_synq_overflow() could update .ts_recent_stamp after we read
    549	 * jiffies but before we store .ts_recent_stamp into last_overflow,
    550	 * which could lead to rejecting a valid syncookie.
    551	 */
    552	return !time_between32(now, last_overflow - HZ,
    553			       last_overflow + TCP_SYNCOOKIE_VALID);
    554}
    555
    556static inline u32 tcp_cookie_time(void)
    557{
    558	u64 val = get_jiffies_64();
    559
    560	do_div(val, TCP_SYNCOOKIE_PERIOD);
    561	return val;
    562}
    563
    564u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
    565			      u16 *mssp);
    566__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
    567u64 cookie_init_timestamp(struct request_sock *req, u64 now);
    568bool cookie_timestamp_decode(const struct net *net,
    569			     struct tcp_options_received *opt);
    570bool cookie_ecn_ok(const struct tcp_options_received *opt,
    571		   const struct net *net, const struct dst_entry *dst);
    572
    573/* From net/ipv6/syncookies.c */
    574int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
    575		      u32 cookie);
    576struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
    577
    578u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
    579			      const struct tcphdr *th, u16 *mssp);
    580__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
    581#endif
    582/* tcp_output.c */
    583
    584void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
    585void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
    586void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
    587			       int nonagle);
    588int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
    589int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
    590void tcp_retransmit_timer(struct sock *sk);
    591void tcp_xmit_retransmit_queue(struct sock *);
    592void tcp_simple_retransmit(struct sock *);
    593void tcp_enter_recovery(struct sock *sk, bool ece_ack);
    594int tcp_trim_head(struct sock *, struct sk_buff *, u32);
    595enum tcp_queue {
    596	TCP_FRAG_IN_WRITE_QUEUE,
    597	TCP_FRAG_IN_RTX_QUEUE,
    598};
    599int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
    600		 struct sk_buff *skb, u32 len,
    601		 unsigned int mss_now, gfp_t gfp);
    602
    603void tcp_send_probe0(struct sock *);
    604void tcp_send_partial(struct sock *);
    605int tcp_write_wakeup(struct sock *, int mib);
    606void tcp_send_fin(struct sock *sk);
    607void tcp_send_active_reset(struct sock *sk, gfp_t priority);
    608int tcp_send_synack(struct sock *);
    609void tcp_push_one(struct sock *, unsigned int mss_now);
    610void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
    611void tcp_send_ack(struct sock *sk);
    612void tcp_send_delayed_ack(struct sock *sk);
    613void tcp_send_loss_probe(struct sock *sk);
    614bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
    615void tcp_skb_collapse_tstamp(struct sk_buff *skb,
    616			     const struct sk_buff *next_skb);
    617
    618/* tcp_input.c */
    619void tcp_rearm_rto(struct sock *sk);
    620void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
    621void tcp_reset(struct sock *sk, struct sk_buff *skb);
    622void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
    623void tcp_fin(struct sock *sk);
    624void tcp_check_space(struct sock *sk);
    625
    626/* tcp_timer.c */
    627void tcp_init_xmit_timers(struct sock *);
    628static inline void tcp_clear_xmit_timers(struct sock *sk)
    629{
    630	if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
    631		__sock_put(sk);
    632
    633	if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
    634		__sock_put(sk);
    635
    636	inet_csk_clear_xmit_timers(sk);
    637}
    638
    639unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
    640unsigned int tcp_current_mss(struct sock *sk);
    641u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
    642
    643/* Bound MSS / TSO packet size with the half of the window */
    644static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
    645{
    646	int cutoff;
    647
    648	/* When peer uses tiny windows, there is no use in packetizing
    649	 * to sub-MSS pieces for the sake of SWS or making sure there
    650	 * are enough packets in the pipe for fast recovery.
    651	 *
    652	 * On the other hand, for extremely large MSS devices, handling
    653	 * smaller than MSS windows in this way does make sense.
    654	 */
    655	if (tp->max_window > TCP_MSS_DEFAULT)
    656		cutoff = (tp->max_window >> 1);
    657	else
    658		cutoff = tp->max_window;
    659
    660	if (cutoff && pktsize > cutoff)
    661		return max_t(int, cutoff, 68U - tp->tcp_header_len);
    662	else
    663		return pktsize;
    664}
    665
    666/* tcp.c */
    667void tcp_get_info(struct sock *, struct tcp_info *);
    668
    669/* Read 'sendfile()'-style from a TCP socket */
    670int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
    671		  sk_read_actor_t recv_actor);
    672
    673void tcp_initialize_rcv_mss(struct sock *sk);
    674
    675int tcp_mtu_to_mss(struct sock *sk, int pmtu);
    676int tcp_mss_to_mtu(struct sock *sk, int mss);
    677void tcp_mtup_init(struct sock *sk);
    678
    679static inline void tcp_bound_rto(const struct sock *sk)
    680{
    681	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
    682		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
    683}
    684
    685static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
    686{
    687	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
    688}
    689
    690static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
    691{
    692	/* mptcp hooks are only on the slow path */
    693	if (sk_is_mptcp((struct sock *)tp))
    694		return;
    695
    696	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
    697			       ntohl(TCP_FLAG_ACK) |
    698			       snd_wnd);
    699}
    700
    701static inline void tcp_fast_path_on(struct tcp_sock *tp)
    702{
    703	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
    704}
    705
    706static inline void tcp_fast_path_check(struct sock *sk)
    707{
    708	struct tcp_sock *tp = tcp_sk(sk);
    709
    710	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
    711	    tp->rcv_wnd &&
    712	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
    713	    !tp->urg_data)
    714		tcp_fast_path_on(tp);
    715}
    716
    717/* Compute the actual rto_min value */
    718static inline u32 tcp_rto_min(struct sock *sk)
    719{
    720	const struct dst_entry *dst = __sk_dst_get(sk);
    721	u32 rto_min = inet_csk(sk)->icsk_rto_min;
    722
    723	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
    724		rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
    725	return rto_min;
    726}
    727
    728static inline u32 tcp_rto_min_us(struct sock *sk)
    729{
    730	return jiffies_to_usecs(tcp_rto_min(sk));
    731}
    732
    733static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
    734{
    735	return dst_metric_locked(dst, RTAX_CC_ALGO);
    736}
    737
    738/* Minimum RTT in usec. ~0 means not available. */
    739static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
    740{
    741	return minmax_get(&tp->rtt_min);
    742}
    743
    744/* Compute the actual receive window we are currently advertising.
    745 * Rcv_nxt can be after the window if our peer push more data
    746 * than the offered window.
    747 */
    748static inline u32 tcp_receive_window(const struct tcp_sock *tp)
    749{
    750	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
    751
    752	if (win < 0)
    753		win = 0;
    754	return (u32) win;
    755}
    756
    757/* Choose a new window, without checks for shrinking, and without
    758 * scaling applied to the result.  The caller does these things
    759 * if necessary.  This is a "raw" window selection.
    760 */
    761u32 __tcp_select_window(struct sock *sk);
    762
    763void tcp_send_window_probe(struct sock *sk);
    764
    765/* TCP uses 32bit jiffies to save some space.
    766 * Note that this is different from tcp_time_stamp, which
    767 * historically has been the same until linux-4.13.
    768 */
    769#define tcp_jiffies32 ((u32)jiffies)
    770
    771/*
    772 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
    773 * It is no longer tied to jiffies, but to 1 ms clock.
    774 * Note: double check if you want to use tcp_jiffies32 instead of this.
    775 */
    776#define TCP_TS_HZ	1000
    777
    778static inline u64 tcp_clock_ns(void)
    779{
    780	return ktime_get_ns();
    781}
    782
    783static inline u64 tcp_clock_us(void)
    784{
    785	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
    786}
    787
    788/* This should only be used in contexts where tp->tcp_mstamp is up to date */
    789static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
    790{
    791	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
    792}
    793
    794/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
    795static inline u32 tcp_ns_to_ts(u64 ns)
    796{
    797	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
    798}
    799
    800/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
    801static inline u32 tcp_time_stamp_raw(void)
    802{
    803	return tcp_ns_to_ts(tcp_clock_ns());
    804}
    805
    806void tcp_mstamp_refresh(struct tcp_sock *tp);
    807
    808static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
    809{
    810	return max_t(s64, t1 - t0, 0);
    811}
    812
    813static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
    814{
    815	return tcp_ns_to_ts(skb->skb_mstamp_ns);
    816}
    817
    818/* provide the departure time in us unit */
    819static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
    820{
    821	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
    822}
    823
    824
    825#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
    826
/* Single-bit masks named after the TCP header flags; together they cover
 * all eight bits of one byte (cf. tcp_flag_byte() and the tcp_flags
 * member of struct tcp_skb_cb).
 */
#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

/* SYN combined with both ECN-related bits (ECE and CWR) */
#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
    837
    838/* This is what the send packet queuing engine uses to pass
    839 * TCP per-packet control information to the transmission code.
    840 * We also store the host-order sequence numbers in here too.
    841 * This is 44 bytes if IPV6 is enabled.
    842 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
    843 */
    844struct tcp_skb_cb {
    845	__u32		seq;		/* Starting sequence number	*/
    846	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
    847	union {
    848		/* Note : tcp_tw_isn is used in input path only
    849		 *	  (isn chosen by tcp_timewait_state_process())
    850		 *
    851		 * 	  tcp_gso_segs/size are used in write queue only,
    852		 *	  cf tcp_skb_pcount()/tcp_skb_mss()
    853		 */
    854		__u32		tcp_tw_isn;
    855		struct {
    856			u16	tcp_gso_segs;
    857			u16	tcp_gso_size;
    858		};
    859	};
    860	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
    861
    862	__u8		sacked;		/* State flags for SACK.	*/
    863#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
    864#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
    865#define TCPCB_LOST		0x04	/* SKB is lost			*/
    866#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
    867#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns)	*/
    868#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
    869#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
    870				TCPCB_REPAIRED)
    871
    872	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
    873	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
    874			eor:1,		/* Is skb MSG_EOR marked? */
    875			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
    876			unused:5;
    877	__u32		ack_seq;	/* Sequence number ACK'd	*/
    878	union {
    879		struct {
    880#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
    881			/* There is space for up to 24 bytes */
    882			__u32 is_app_limited:1, /* cwnd not fully used? */
    883			      delivered_ce:20,
    884			      unused:11;
    885			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
    886			__u32 delivered;
    887			/* start of send pipeline phase */
    888			u64 first_tx_mstamp;
    889			/* when we reached the "delivered" count */
    890			u64 delivered_mstamp;
    891		} tx;   /* only used for outgoing skbs */
    892		union {
    893			struct inet_skb_parm	h4;
    894#if IS_ENABLED(CONFIG_IPV6)
    895			struct inet6_skb_parm	h6;
    896#endif
    897		} header;	/* For incoming skbs */
    898	};
    899};
    900
    901#define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
    902
    903extern const struct inet_connection_sock_af_ops ipv4_specific;
    904
    905#if IS_ENABLED(CONFIG_IPV6)
    906/* This is the variant of inet6_iif() that must be used by TCP,
    907 * as TCP moves IP6CB into a different location in skb->cb[]
    908 */
    909static inline int tcp_v6_iif(const struct sk_buff *skb)
    910{
    911	return TCP_SKB_CB(skb)->header.h6.iif;
    912}
    913
    914static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
    915{
    916	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
    917
    918	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
    919}
    920
    921/* TCP_SKB_CB reference means this can not be used from early demux */
    922static inline int tcp_v6_sdif(const struct sk_buff *skb)
    923{
    924#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
    925	if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
    926		return TCP_SKB_CB(skb)->header.h6.iif;
    927#endif
    928	return 0;
    929}
    930
    931extern const struct inet_connection_sock_af_ops ipv6_specific;
    932
    933INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
    934INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
    935INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb));
    936
    937#endif
    938
    939/* TCP_SKB_CB reference means this can not be used from early demux */
    940static inline int tcp_v4_sdif(struct sk_buff *skb)
    941{
    942#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
    943	if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
    944		return TCP_SKB_CB(skb)->header.h4.iif;
    945#endif
    946	return 0;
    947}
    948
    949/* Due to TSO, an SKB can be composed of multiple actual
    950 * packets.  To keep these tracked properly, we use this.
    951 */
    952static inline int tcp_skb_pcount(const struct sk_buff *skb)
    953{
    954	return TCP_SKB_CB(skb)->tcp_gso_segs;
    955}
    956
    957static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
    958{
    959	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
    960}
    961
    962static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
    963{
    964	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
    965}
    966
    967/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
    968static inline int tcp_skb_mss(const struct sk_buff *skb)
    969{
    970	return TCP_SKB_CB(skb)->tcp_gso_size;
    971}
    972
    973static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
    974{
    975	return likely(!TCP_SKB_CB(skb)->eor);
    976}
    977
    978static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
    979					const struct sk_buff *from)
    980{
    981	return likely(tcp_skb_can_collapse_to(to) &&
    982		      mptcp_skb_can_collapse(to, from) &&
    983		      skb_pure_zcopy_same(to, from));
    984}
    985
    986/* Events passed to congestion control interface */
    987enum tcp_ca_event {
    988	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
    989	CA_EVENT_CWND_RESTART,	/* congestion window restart */
    990	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
    991	CA_EVENT_LOSS,		/* loss timeout */
    992	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
    993	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
    994};
    995
    996/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
    997enum tcp_ca_ack_event_flags {
    998	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
    999	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
   1000	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
   1001};
   1002
   1003/*
   1004 * Interface for adding new TCP congestion control handlers
   1005 */
   1006#define TCP_CA_NAME_MAX	16
   1007#define TCP_CA_MAX	128
   1008#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
   1009
   1010#define TCP_CA_UNSPEC	0
   1011
   1012/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
   1013#define TCP_CONG_NON_RESTRICTED 0x1
   1014/* Requires ECN/ECT set on all packets */
   1015#define TCP_CONG_NEEDS_ECN	0x2
   1016#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
   1017
   1018union tcp_cc_info;
   1019
/* Sample handed to the ->pkts_acked() hook of struct tcp_congestion_ops.
 * NOTE(review): field meanings below are inferred from the names only;
 * confirm against the code that fills the sample.
 */
struct ack_sample {
	u32 pkts_acked;	/* presumably: number of packets acked */
	s32 rtt_us;	/* presumably: RTT in usec (signed) */
	u32 in_flight;	/* presumably: packets in flight */
};
   1025
/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally choose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	u64  prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	s32  delivered_ce;	/* number of packets delivered w/ CE marks */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	u32 snd_interval_us;	/* snd interval for delivered packets */
	u32 rcv_interval_us;	/* rcv interval for delivered packets */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
};
   1052
   1053struct tcp_congestion_ops {
   1054/* fast path fields are put first to fill one cache line */
   1055
   1056	/* return slow start threshold (required) */
   1057	u32 (*ssthresh)(struct sock *sk);
   1058
   1059	/* do new cwnd calculation (required) */
   1060	void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
   1061
   1062	/* call before changing ca_state (optional) */
   1063	void (*set_state)(struct sock *sk, u8 new_state);
   1064
   1065	/* call when cwnd event occurs (optional) */
   1066	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
   1067
   1068	/* call when ack arrives (optional) */
   1069	void (*in_ack_event)(struct sock *sk, u32 flags);
   1070
   1071	/* hook for packet ack accounting (optional) */
   1072	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
   1073
   1074	/* override sysctl_tcp_min_tso_segs */
   1075	u32 (*min_tso_segs)(struct sock *sk);
   1076
   1077	/* call when packets are delivered to update cwnd and pacing rate,
   1078	 * after all the ca_state processing. (optional)
   1079	 */
   1080	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
   1081
   1082
   1083	/* new value of cwnd after loss (required) */
   1084	u32  (*undo_cwnd)(struct sock *sk);
   1085	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
   1086	u32 (*sndbuf_expand)(struct sock *sk);
   1087
   1088/* control/slow paths put last */
   1089	/* get info for inet_diag (optional) */
   1090	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
   1091			   union tcp_cc_info *info);
   1092
   1093	char 			name[TCP_CA_NAME_MAX];
   1094	struct module		*owner;
   1095	struct list_head	list;
   1096	u32			key;
   1097	u32			flags;
   1098
   1099	/* initialize private data (optional) */
   1100	void (*init)(struct sock *sk);
   1101	/* cleanup private data  (optional) */
   1102	void (*release)(struct sock *sk);
   1103} ____cacheline_aligned_in_smp;
   1104
   1105int tcp_register_congestion_control(struct tcp_congestion_ops *type);
   1106void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
   1107
   1108void tcp_assign_congestion_control(struct sock *sk);
   1109void tcp_init_congestion_control(struct sock *sk);
   1110void tcp_cleanup_congestion_control(struct sock *sk);
   1111int tcp_set_default_congestion_control(struct net *net, const char *name);
   1112void tcp_get_default_congestion_control(struct net *net, char *name);
   1113void tcp_get_available_congestion_control(char *buf, size_t len);
   1114void tcp_get_allowed_congestion_control(char *buf, size_t len);
   1115int tcp_set_allowed_congestion_control(char *allowed);
   1116int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
   1117			       bool cap_net_admin);
   1118u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
   1119void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
   1120
   1121u32 tcp_reno_ssthresh(struct sock *sk);
   1122u32 tcp_reno_undo_cwnd(struct sock *sk);
   1123void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
   1124extern struct tcp_congestion_ops tcp_reno;
   1125
   1126struct tcp_congestion_ops *tcp_ca_find(const char *name);
   1127struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
   1128u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
   1129#ifdef CONFIG_INET
   1130char *tcp_ca_get_name_by_key(u32 key, char *buffer);
   1131#else
/* Stub for builds without CONFIG_INET: @key and @buffer are ignored and
 * NULL is always returned.
 */
static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
	return NULL;
}
   1136#endif
   1137
   1138static inline bool tcp_ca_needs_ecn(const struct sock *sk)
   1139{
   1140	const struct inet_connection_sock *icsk = inet_csk(sk);
   1141
   1142	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
   1143}
   1144
   1145static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
   1146{
   1147	const struct inet_connection_sock *icsk = inet_csk(sk);
   1148
   1149	if (icsk->icsk_ca_ops->cwnd_event)
   1150		icsk->icsk_ca_ops->cwnd_event(sk, event);
   1151}
   1152
   1153/* From tcp_cong.c */
   1154void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
   1155
   1156/* From tcp_rate.c */
   1157void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
   1158void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
   1159			    struct rate_sample *rs);
   1160void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
   1161		  bool is_sack_reneg, struct rate_sample *rs);
   1162void tcp_rate_check_app_limited(struct sock *sk);
   1163
   1164static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
   1165{
   1166	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
   1167}
   1168
   1169/* These functions determine how the current flow behaves in respect of SACK
   1170 * handling. SACK is negotiated with the peer, and therefore it can vary
   1171 * between different flows.
   1172 *
   1173 * tcp_is_sack - SACK enabled
   1174 * tcp_is_reno - No SACK
   1175 */
   1176static inline int tcp_is_sack(const struct tcp_sock *tp)
   1177{
   1178	return likely(tp->rx_opt.sack_ok);
   1179}
   1180
   1181static inline bool tcp_is_reno(const struct tcp_sock *tp)
   1182{
   1183	return !tcp_is_sack(tp);
   1184}
   1185
   1186static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
   1187{
   1188	return tp->sacked_out + tp->lost_out;
   1189}
   1190
   1191/* This determines how many packets are "in the network" to the best
   1192 * of our knowledge.  In many cases it is conservative, but where
   1193 * detailed information is available from the receiver (via SACK
   1194 * blocks etc.) we can make more aggressive calculations.
   1195 *
   1196 * Use this for decisions involving congestion control, use just
   1197 * tp->packets_out to determine if the send queue is empty or not.
   1198 *
   1199 * Read this equation as:
   1200 *
   1201 *	"Packets sent once on transmission queue" MINUS
   1202 *	"Packets left network, but not honestly ACKed yet" PLUS
   1203 *	"Packets fast retransmitted"
   1204 */
   1205static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
   1206{
   1207	return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
   1208}
   1209
   1210#define TCP_INFINITE_SSTHRESH	0x7fffffff
   1211
/* Read accessor for tp->snd_cwnd. */
static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
{
	return tp->snd_cwnd;
}
   1216
/* Write accessor for tp->snd_cwnd.
 * Warns (once) when @val, interpreted as a signed int, is zero or
 * negative; the value is stored regardless.
 */
static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
{
	WARN_ON_ONCE((int)val <= 0);
	tp->snd_cwnd = val;
}
   1222
   1223static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
   1224{
   1225	return tcp_snd_cwnd(tp) < tp->snd_ssthresh;
   1226}
   1227
   1228static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
   1229{
   1230	return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
   1231}
   1232
   1233static inline bool tcp_in_cwnd_reduction(const struct sock *sk)
   1234{
   1235	return (TCPF_CA_CWR | TCPF_CA_Recovery) &
   1236	       (1 << inet_csk(sk)->icsk_ca_state);
   1237}
   1238
   1239/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
   1240 * The exception is cwnd reduction phase, when cwnd is decreasing towards
   1241 * ssthresh.
   1242 */
   1243static inline __u32 tcp_current_ssthresh(const struct sock *sk)
   1244{
   1245	const struct tcp_sock *tp = tcp_sk(sk);
   1246
   1247	if (tcp_in_cwnd_reduction(sk))
   1248		return tp->snd_ssthresh;
   1249	else
   1250		return max(tp->snd_ssthresh,
   1251			   ((tcp_snd_cwnd(tp) >> 1) +
   1252			    (tcp_snd_cwnd(tp) >> 2)));
   1253}
   1254
   1255/* Use define here intentionally to get WARN_ON location shown at the caller */
   1256#define tcp_verify_left_out(tp)	WARN_ON(tcp_left_out(tp) > tp->packets_out)
   1257
   1258void tcp_enter_cwr(struct sock *sk);
   1259__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);
   1260
/* The maximum number of MSS of available cwnd for which TSO defers
 * sending if not using sysctl_tcp_tso_win_divisor.
 * Currently a constant; @tp is not consulted.
 */
static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp)
{
	return 3;
}
   1268
   1269/* Returns end sequence number of the receiver's advertised window */
   1270static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
   1271{
   1272	return tp->snd_una + tp->snd_wnd;
   1273}
   1274
   1275/* We follow the spirit of RFC2861 to validate cwnd but implement a more
   1276 * flexible approach. The RFC suggests cwnd should not be raised unless
   1277 * it was fully used previously. And that's exactly what we do in
   1278 * congestion avoidance mode. But in slow start we allow cwnd to grow
   1279 * as long as the application has used half the cwnd.
   1280 * Example :
   1281 *    cwnd is 10 (IW10), but application sends 9 frames.
   1282 *    We allow cwnd to reach 18 when all frames are ACKed.
   1283 * This check is safe because it's as aggressive as slow start which already
   1284 * risks 100% overshoot. The advantage is that we discourage application to
   1285 * either send more filler packets or data to artificially blow up the cwnd
   1286 * usage, and allow application-limited process to probe bw more aggressively.
   1287 */
   1288static inline bool tcp_is_cwnd_limited(const struct sock *sk)
   1289{
   1290	const struct tcp_sock *tp = tcp_sk(sk);
   1291
   1292	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
   1293	if (tcp_in_slow_start(tp))
   1294		return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out;
   1295
   1296	return tp->is_cwnd_limited;
   1297}
   1298
   1299/* BBR congestion control needs pacing.
   1300 * Same remark for SO_MAX_PACING_RATE.
   1301 * sch_fq packet scheduler is efficiently handling pacing,
   1302 * but is not always installed/used.
   1303 * Return true if TCP stack should pace packets itself.
   1304 */
   1305static inline bool tcp_needs_internal_pacing(const struct sock *sk)
   1306{
   1307	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
   1308}
   1309
   1310/* Estimates in how many jiffies next packet for this flow can be sent.
   1311 * Scheduling a retransmit timer too early would be silly.
   1312 */
   1313static inline unsigned long tcp_pacing_delay(const struct sock *sk)
   1314{
   1315	s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
   1316
   1317	return delay > 0 ? nsecs_to_jiffies(delay) : 0;
   1318}
   1319
   1320static inline void tcp_reset_xmit_timer(struct sock *sk,
   1321					const int what,
   1322					unsigned long when,
   1323					const unsigned long max_when)
   1324{
   1325	inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk),
   1326				  max_when);
   1327}
   1328
   1329/* Something is really bad, we could not queue an additional packet,
   1330 * because qdisc is full or receiver sent a 0 window, or we are paced.
   1331 * We do not want to add fuel to the fire, or abort too early,
   1332 * so make sure the timer we arm now is at least 200ms in the future,
   1333 * regardless of current icsk_rto value (as it could be ~2ms)
   1334 */
   1335static inline unsigned long tcp_probe0_base(const struct sock *sk)
   1336{
   1337	return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
   1338}
   1339
   1340/* Variant of inet_csk_rto_backoff() used for zero window probes */
   1341static inline unsigned long tcp_probe0_when(const struct sock *sk,
   1342					    unsigned long max_when)
   1343{
   1344	u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1,
   1345			   inet_csk(sk)->icsk_backoff);
   1346	u64 when = (u64)tcp_probe0_base(sk) << backoff;
   1347
   1348	return (unsigned long)min_t(u64, when, max_when);
   1349}
   1350
   1351static inline void tcp_check_probe_timer(struct sock *sk)
   1352{
   1353	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
   1354		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
   1355				     tcp_probe0_base(sk), TCP_RTO_MAX);
   1356}
   1357
/* Set tp->snd_wl1 to @seq.  Same body as tcp_update_wl(). */
static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
{
	tp->snd_wl1 = seq;
}
   1362
/* Set tp->snd_wl1 to @seq.  Same body as tcp_init_wl(). */
static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
{
	tp->snd_wl1 = seq;
}
   1367
   1368/*
   1369 * Calculate(/check) TCP checksum
   1370 */
   1371static inline __sum16 tcp_v4_check(int len, __be32 saddr,
   1372				   __be32 daddr, __wsum base)
   1373{
   1374	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
   1375}
   1376
   1377static inline bool tcp_checksum_complete(struct sk_buff *skb)
   1378{
   1379	return !skb_csum_unnecessary(skb) &&
   1380		__skb_checksum_complete(skb);
   1381}
   1382
   1383bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
   1384		     enum skb_drop_reason *reason);
   1385
   1386
   1387int tcp_filter(struct sock *sk, struct sk_buff *skb);
   1388void tcp_set_state(struct sock *sk, int state);
   1389void tcp_done(struct sock *sk);
   1390int tcp_abort(struct sock *sk, int err);
   1391
   1392static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
   1393{
   1394	rx_opt->dsack = 0;
   1395	rx_opt->num_sacks = 0;
   1396}
   1397
   1398void tcp_cwnd_restart(struct sock *sk, s32 delta);
   1399
   1400static inline void tcp_slow_start_after_idle_check(struct sock *sk)
   1401{
   1402	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
   1403	struct tcp_sock *tp = tcp_sk(sk);
   1404	s32 delta;
   1405
   1406	if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
   1407	    ca_ops->cong_control)
   1408		return;
   1409	delta = tcp_jiffies32 - tp->lsndtime;
   1410	if (delta > inet_csk(sk)->icsk_rto)
   1411		tcp_cwnd_restart(sk, delta);
   1412}
   1413
   1414/* Determine a window scaling and initial window to offer. */
   1415void tcp_select_initial_window(const struct sock *sk, int __space,
   1416			       __u32 mss, __u32 *rcv_wnd,
   1417			       __u32 *window_clamp, int wscale_ok,
   1418			       __u8 *rcv_wscale, __u32 init_rcv_wnd);
   1419
   1420static inline int tcp_win_from_space(const struct sock *sk, int space)
   1421{
   1422	int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
   1423
   1424	return tcp_adv_win_scale <= 0 ?
   1425		(space>>(-tcp_adv_win_scale)) :
   1426		space - (space>>tcp_adv_win_scale);
   1427}
   1428
   1429/* Note: caller must be prepared to deal with negative returns */
   1430static inline int tcp_space(const struct sock *sk)
   1431{
   1432	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
   1433				  READ_ONCE(sk->sk_backlog.len) -
   1434				  atomic_read(&sk->sk_rmem_alloc));
   1435}
   1436
   1437static inline int tcp_full_space(const struct sock *sk)
   1438{
   1439	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
   1440}
   1441
   1442static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
   1443{
   1444	int unused_mem = sk_unused_reserved_mem(sk);
   1445	struct tcp_sock *tp = tcp_sk(sk);
   1446
   1447	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
   1448	if (unused_mem)
   1449		tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh,
   1450					 tcp_win_from_space(sk, unused_mem));
   1451}
   1452
   1453void tcp_cleanup_rbuf(struct sock *sk, int copied);
   1454
   1455/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
   1456 * If 87.5 % (7/8) of the space has been consumed, we want to override
   1457 * SO_RCVLOWAT constraint, since we are receiving skbs with too small
   1458 * len/truesize ratio.
   1459 */
   1460static inline bool tcp_rmem_pressure(const struct sock *sk)
   1461{
   1462	int rcvbuf, threshold;
   1463
   1464	if (tcp_under_memory_pressure(sk))
   1465		return true;
   1466
   1467	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
   1468	threshold = rcvbuf - (rcvbuf >> 3);
   1469
   1470	return atomic_read(&sk->sk_rmem_alloc) > threshold;
   1471}
   1472
   1473static inline bool tcp_epollin_ready(const struct sock *sk, int target)
   1474{
   1475	const struct tcp_sock *tp = tcp_sk(sk);
   1476	int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
   1477
   1478	if (avail <= 0)
   1479		return false;
   1480
   1481	return (avail >= target) || tcp_rmem_pressure(sk) ||
   1482	       (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
   1483}
   1484
   1485extern void tcp_openreq_init_rwin(struct request_sock *req,
   1486				  const struct sock *sk_listener,
   1487				  const struct dst_entry *dst);
   1488
   1489void tcp_enter_memory_pressure(struct sock *sk);
   1490void tcp_leave_memory_pressure(struct sock *sk);
   1491
   1492static inline int keepalive_intvl_when(const struct tcp_sock *tp)
   1493{
   1494	struct net *net = sock_net((struct sock *)tp);
   1495
   1496	return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
   1497}
   1498
   1499static inline int keepalive_time_when(const struct tcp_sock *tp)
   1500{
   1501	struct net *net = sock_net((struct sock *)tp);
   1502
   1503	return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time;
   1504}
   1505
   1506static inline int keepalive_probes(const struct tcp_sock *tp)
   1507{
   1508	struct net *net = sock_net((struct sock *)tp);
   1509
   1510	return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
   1511}
   1512
   1513static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
   1514{
   1515	const struct inet_connection_sock *icsk = &tp->inet_conn;
   1516
   1517	return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime,
   1518			  tcp_jiffies32 - tp->rcv_tstamp);
   1519}
   1520
   1521static inline int tcp_fin_time(const struct sock *sk)
   1522{
   1523	int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
   1524	const int rto = inet_csk(sk)->icsk_rto;
   1525
   1526	if (fin_timeout < (rto << 2) - (rto >> 1))
   1527		fin_timeout = (rto << 2) - (rto >> 1);
   1528
   1529	return fin_timeout;
   1530}
   1531
   1532static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
   1533				  int paws_win)
   1534{
   1535	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
   1536		return true;
   1537	if (unlikely(!time_before32(ktime_get_seconds(),
   1538				    rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
   1539		return true;
   1540	/*
   1541	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
   1542	 * then following tcp messages have valid values. Ignore 0 value,
   1543	 * or else 'negative' tsval might forbid us to accept their packets.
   1544	 */
   1545	if (!rx_opt->ts_recent)
   1546		return true;
   1547	return false;
   1548}
   1549
   1550static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
   1551				   int rst)
   1552{
   1553	if (tcp_paws_check(rx_opt, 0))
   1554		return false;
   1555
   1556	/* RST segments are not recommended to carry timestamp,
   1557	   and, if they do, it is recommended to ignore PAWS because
   1558	   "their cleanup function should take precedence over timestamps."
   1559	   Certainly, it is mistake. It is necessary to understand the reasons
   1560	   of this constraint to relax it: if peer reboots, clock may go
   1561	   out-of-sync and half-open connections will not be reset.
   1562	   Actually, the problem would be not existing if all
   1563	   the implementations followed draft about maintaining clock
   1564	   via reboots. Linux-2.2 DOES NOT!
   1565
   1566	   However, we can relax time bounds for RST segments to MSL.
   1567	 */
   1568	if (rst && !time_before32(ktime_get_seconds(),
   1569				  rx_opt->ts_recent_stamp + TCP_PAWS_MSL))
   1570		return false;
   1571	return true;
   1572}
   1573
   1574bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
   1575			  int mib_idx, u32 *last_oow_ack_time);
   1576
/* Add the initial values of the static TCP MIB entries for @net. */
static inline void tcp_mib_init(struct net *net)
{
	/* See RFC 2012 */
	TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1);
	/* RTO bounds are converted from jiffies: value * 1000 / HZ */
	TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
	TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
	TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1);
}
   1585
/* from STCP */
/* Reset tp->lost_skb_hint only; retransmit_skb_hint is left untouched. */
static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
{
	tp->lost_skb_hint = NULL;
}
   1591
/* Reset both retransmit hints: lost_skb_hint (through
 * tcp_clear_retrans_hints_partial()) and retransmit_skb_hint.
 */
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
	tcp_clear_retrans_hints_partial(tp);
	tp->retransmit_skb_hint = NULL;
}
   1597
   1598union tcp_md5_addr {
   1599	struct in_addr  a4;
   1600#if IS_ENABLED(CONFIG_IPV6)
   1601	struct in6_addr	a6;
   1602#endif
   1603};
   1604
   1605/* - key database */
   1606struct tcp_md5sig_key {
   1607	struct hlist_node	node;
   1608	u8			keylen;
   1609	u8			family; /* AF_INET or AF_INET6 */
   1610	u8			prefixlen;
   1611	u8			flags;
   1612	union tcp_md5_addr	addr;
   1613	int			l3index; /* set if key added with L3 scope */
   1614	u8			key[TCP_MD5SIG_MAXKEYLEN];
   1615	struct rcu_head		rcu;
   1616};
   1617
   1618/* - sock block */
   1619struct tcp_md5sig_info {
   1620	struct hlist_head	head;
   1621	struct rcu_head		rcu;
   1622};
   1623
   1624/* - pseudo header */
   1625struct tcp4_pseudohdr {
   1626	__be32		saddr;
   1627	__be32		daddr;
   1628	__u8		pad;
   1629	__u8		protocol;
   1630	__be16		len;
   1631};
   1632
   1633struct tcp6_pseudohdr {
   1634	struct in6_addr	saddr;
   1635	struct in6_addr daddr;
   1636	__be32		len;
   1637	__be32		protocol;	/* including padding */
   1638};
   1639
   1640union tcp_md5sum_block {
   1641	struct tcp4_pseudohdr ip4;
   1642#if IS_ENABLED(CONFIG_IPV6)
   1643	struct tcp6_pseudohdr ip6;
   1644#endif
   1645};
   1646
   1647/* - pool: digest algorithm, hash description and scratch buffer */
   1648struct tcp_md5sig_pool {
   1649	struct ahash_request	*md5_req;
   1650	void			*scratch;
   1651};
   1652
   1653/* - functions */
   1654int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
   1655			const struct sock *sk, const struct sk_buff *skb);
   1656int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
   1657		   int family, u8 prefixlen, int l3index, u8 flags,
   1658		   const u8 *newkey, u8 newkeylen, gfp_t gfp);
   1659int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
   1660		   int family, u8 prefixlen, int l3index, u8 flags);
   1661struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
   1662					 const struct sock *addr_sk);
   1663
   1664#ifdef CONFIG_TCP_MD5SIG
   1665#include <linux/jump_label.h>
   1666extern struct static_key_false tcp_md5_needed;
   1667struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
   1668					   const union tcp_md5_addr *addr,
   1669					   int family);
   1670static inline struct tcp_md5sig_key *
   1671tcp_md5_do_lookup(const struct sock *sk, int l3index,
   1672		  const union tcp_md5_addr *addr, int family)
   1673{
   1674	if (!static_branch_unlikely(&tcp_md5_needed))
   1675		return NULL;
   1676	return __tcp_md5_do_lookup(sk, l3index, addr, family);
   1677}
   1678
   1679enum skb_drop_reason
   1680tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
   1681		     const void *saddr, const void *daddr,
   1682		     int family, int dif, int sdif);
   1683
   1684
   1685#define tcp_twsk_md5_key(twsk)	((twsk)->tw_md5_key)
   1686#else
   1687static inline struct tcp_md5sig_key *
   1688tcp_md5_do_lookup(const struct sock *sk, int l3index,
   1689		  const union tcp_md5_addr *addr, int family)
   1690{
   1691	return NULL;
   1692}
   1693
   1694static inline enum skb_drop_reason
   1695tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
   1696		     const void *saddr, const void *daddr,
   1697		     int family, int dif, int sdif)
   1698{
   1699	return SKB_NOT_DROPPED_YET;
   1700}
   1701#define tcp_twsk_md5_key(twsk)	NULL
   1702#endif
   1703
   1704bool tcp_alloc_md5sig_pool(void);
   1705
   1706struct tcp_md5sig_pool *tcp_get_md5sig_pool(void);
   1707static inline void tcp_put_md5sig_pool(void)
   1708{
   1709	local_bh_enable();
   1710}
   1711
   1712int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
   1713			  unsigned int header_len);
   1714int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
   1715		     const struct tcp_md5sig_key *key);
   1716
   1717/* From tcp_fastopen.c */
   1718void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
   1719			    struct tcp_fastopen_cookie *cookie);
   1720void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
   1721			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
   1722			    u16 try_exp);
   1723struct tcp_fastopen_request {
   1724	/* Fast Open cookie. Size 0 means a cookie request */
   1725	struct tcp_fastopen_cookie	cookie;
   1726	struct msghdr			*data;  /* data in MSG_FASTOPEN */
   1727	size_t				size;
   1728	int				copied;	/* queued in tcp_connect() */
   1729	struct ubuf_info		*uarg;
   1730};
   1731void tcp_free_fastopen_req(struct tcp_sock *tp);
   1732void tcp_fastopen_destroy_cipher(struct sock *sk);
   1733void tcp_fastopen_ctx_destroy(struct net *net);
   1734int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
   1735			      void *primary_key, void *backup_key);
   1736int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
   1737			    u64 *key);
   1738void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
   1739struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
   1740			      struct request_sock *req,
   1741			      struct tcp_fastopen_cookie *foc,
   1742			      const struct dst_entry *dst);
   1743void tcp_fastopen_init_key_once(struct net *net);
   1744bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
   1745			     struct tcp_fastopen_cookie *cookie);
   1746bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
   1747#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
   1748#define TCP_FASTOPEN_KEY_MAX 2
   1749#define TCP_FASTOPEN_KEY_BUF_LENGTH \
   1750	(TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
   1751
   1752/* Fastopen key context */
   1753struct tcp_fastopen_context {
   1754	siphash_key_t	key[TCP_FASTOPEN_KEY_MAX];
   1755	int		num;
   1756	struct rcu_head	rcu;
   1757};
   1758
   1759void tcp_fastopen_active_disable(struct sock *sk);
   1760bool tcp_fastopen_active_should_disable(struct sock *sk);
   1761void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
   1762void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
   1763
   1764/* Caller needs to wrap with rcu_read_(un)lock() */
   1765static inline
   1766struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
   1767{
   1768	struct tcp_fastopen_context *ctx;
   1769
   1770	ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
   1771	if (!ctx)
   1772		ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
   1773	return ctx;
   1774}
   1775
   1776static inline
   1777bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
   1778			       const struct tcp_fastopen_cookie *orig)
   1779{
   1780	if (orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
   1781	    orig->len == foc->len &&
   1782	    !memcmp(orig->val, foc->val, foc->len))
   1783		return true;
   1784	return false;
   1785}
   1786
   1787static inline
   1788int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
   1789{
   1790	return ctx->num;
   1791}
   1792
   1793/* Latencies incurred by various limits for a sender. They are
   1794 * chronograph-like stats that are mutually exclusive.
   1795 */
   1796enum tcp_chrono {
   1797	TCP_CHRONO_UNSPEC,
   1798	TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
   1799	TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
   1800	TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
   1801	__TCP_CHRONO_MAX,
   1802};
   1803
   1804void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
   1805void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
   1806
/* This helper is needed because skb->tcp_tsorted_anchor uses
 * the same memory storage as skb->destructor/_skb_refdst
 */
   1810static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
   1811{
   1812	skb->destructor = NULL;
   1813	skb->_skb_refdst = 0UL;
   1814}
   1815
   1816#define tcp_skb_tsorted_save(skb) {		\
   1817	unsigned long _save = skb->_skb_refdst;	\
   1818	skb->_skb_refdst = 0UL;
   1819
   1820#define tcp_skb_tsorted_restore(skb)		\
   1821	skb->_skb_refdst = _save;		\
   1822}
   1823
   1824void tcp_write_queue_purge(struct sock *sk);
   1825
   1826static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
   1827{
   1828	return skb_rb_first(&sk->tcp_rtx_queue);
   1829}
   1830
   1831static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
   1832{
   1833	return skb_rb_last(&sk->tcp_rtx_queue);
   1834}
   1835
   1836static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
   1837{
   1838	return skb_peek_tail(&sk->sk_write_queue);
   1839}
   1840
   1841#define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
   1842	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
   1843
   1844static inline struct sk_buff *tcp_send_head(const struct sock *sk)
   1845{
   1846	return skb_peek(&sk->sk_write_queue);
   1847}
   1848
   1849static inline bool tcp_skb_is_last(const struct sock *sk,
   1850				   const struct sk_buff *skb)
   1851{
   1852	return skb_queue_is_last(&sk->sk_write_queue, skb);
   1853}
   1854
   1855/**
   1856 * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
   1857 * @sk: socket
   1858 *
   1859 * Since the write queue can have a temporary empty skb in it,
   1860 * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
   1861 */
   1862static inline bool tcp_write_queue_empty(const struct sock *sk)
   1863{
   1864	const struct tcp_sock *tp = tcp_sk(sk);
   1865
   1866	return tp->write_seq == tp->snd_nxt;
   1867}
   1868
   1869static inline bool tcp_rtx_queue_empty(const struct sock *sk)
   1870{
   1871	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
   1872}
   1873
   1874static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
   1875{
   1876	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
   1877}
   1878
   1879static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
   1880{
   1881	__skb_queue_tail(&sk->sk_write_queue, skb);
   1882
   1883	/* Queue it, remembering where we must start sending. */
   1884	if (sk->sk_write_queue.next == skb)
   1885		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
   1886}
   1887
   1888/* Insert new before skb on the write queue of sk.  */
   1889static inline void tcp_insert_write_queue_before(struct sk_buff *new,
   1890						  struct sk_buff *skb,
   1891						  struct sock *sk)
   1892{
   1893	__skb_queue_before(&sk->sk_write_queue, skb, new);
   1894}
   1895
   1896static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
   1897{
   1898	tcp_skb_tsorted_anchor_cleanup(skb);
   1899	__skb_unlink(skb, &sk->sk_write_queue);
   1900}
   1901
   1902void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
   1903
   1904static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
   1905{
   1906	tcp_skb_tsorted_anchor_cleanup(skb);
   1907	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
   1908}
   1909
   1910static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
   1911{
   1912	list_del(&skb->tcp_tsorted_anchor);
   1913	tcp_rtx_queue_unlink(skb, sk);
   1914	tcp_wmem_free_skb(sk, skb);
   1915}
   1916
   1917static inline void tcp_push_pending_frames(struct sock *sk)
   1918{
   1919	if (tcp_send_head(sk)) {
   1920		struct tcp_sock *tp = tcp_sk(sk);
   1921
   1922		__tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
   1923	}
   1924}
   1925
   1926/* Start sequence of the skb just after the highest skb with SACKed
   1927 * bit, valid only if sacked_out > 0 or when the caller has ensured
   1928 * validity by itself.
   1929 */
   1930static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
   1931{
   1932	if (!tp->sacked_out)
   1933		return tp->snd_una;
   1934
   1935	if (tp->highest_sack == NULL)
   1936		return tp->snd_nxt;
   1937
   1938	return TCP_SKB_CB(tp->highest_sack)->seq;
   1939}
   1940
   1941static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
   1942{
   1943	tcp_sk(sk)->highest_sack = skb_rb_next(skb);
   1944}
   1945
   1946static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
   1947{
   1948	return tcp_sk(sk)->highest_sack;
   1949}
   1950
   1951static inline void tcp_highest_sack_reset(struct sock *sk)
   1952{
   1953	tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk);
   1954}
   1955
   1956/* Called when old skb is about to be deleted and replaced by new skb */
   1957static inline void tcp_highest_sack_replace(struct sock *sk,
   1958					    struct sk_buff *old,
   1959					    struct sk_buff *new)
   1960{
   1961	if (old == tcp_highest_sack(sk))
   1962		tcp_sk(sk)->highest_sack = new;
   1963}
   1964
   1965/* This helper checks if socket has IP_TRANSPARENT set */
   1966static inline bool inet_sk_transparent(const struct sock *sk)
   1967{
   1968	switch (sk->sk_state) {
   1969	case TCP_TIME_WAIT:
   1970		return inet_twsk(sk)->tw_transparent;
   1971	case TCP_NEW_SYN_RECV:
   1972		return inet_rsk(inet_reqsk(sk))->no_srccheck;
   1973	}
   1974	return inet_sk(sk)->transparent;
   1975}
   1976
   1977/* Determines whether this is a thin stream (which may suffer from
   1978 * increased latency). Used to trigger latency-reducing mechanisms.
   1979 */
   1980static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
   1981{
   1982	return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
   1983}
   1984
   1985/* /proc */
   1986enum tcp_seq_states {
   1987	TCP_SEQ_STATE_LISTENING,
   1988	TCP_SEQ_STATE_ESTABLISHED,
   1989};
   1990
   1991void *tcp_seq_start(struct seq_file *seq, loff_t *pos);
   1992void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
   1993void tcp_seq_stop(struct seq_file *seq, void *v);
   1994
   1995struct tcp_seq_afinfo {
   1996	sa_family_t			family;
   1997};
   1998
   1999struct tcp_iter_state {
   2000	struct seq_net_private	p;
   2001	enum tcp_seq_states	state;
   2002	struct sock		*syn_wait_sk;
   2003	int			bucket, offset, sbucket, num;
   2004	loff_t			last_pos;
   2005};
   2006
   2007extern struct request_sock_ops tcp_request_sock_ops;
   2008extern struct request_sock_ops tcp6_request_sock_ops;
   2009
   2010void tcp_v4_destroy_sock(struct sock *sk);
   2011
   2012struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
   2013				netdev_features_t features);
   2014struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
   2015INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
   2016INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
   2017INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
   2018INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
   2019int tcp_gro_complete(struct sk_buff *skb);
   2020
   2021void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
   2022
   2023static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
   2024{
   2025	struct net *net = sock_net((struct sock *)tp);
   2026	return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
   2027}
   2028
   2029bool tcp_stream_memory_free(const struct sock *sk, int wake);
   2030
   2031#ifdef CONFIG_PROC_FS
   2032int tcp4_proc_init(void);
   2033void tcp4_proc_exit(void);
   2034#endif
   2035
   2036int tcp_rtx_synack(const struct sock *sk, struct request_sock *req);
   2037int tcp_conn_request(struct request_sock_ops *rsk_ops,
   2038		     const struct tcp_request_sock_ops *af_ops,
   2039		     struct sock *sk, struct sk_buff *skb);
   2040
   2041/* TCP af-specific functions */
   2042struct tcp_sock_af_ops {
   2043#ifdef CONFIG_TCP_MD5SIG
   2044	struct tcp_md5sig_key	*(*md5_lookup) (const struct sock *sk,
   2045						const struct sock *addr_sk);
   2046	int		(*calc_md5_hash)(char *location,
   2047					 const struct tcp_md5sig_key *md5,
   2048					 const struct sock *sk,
   2049					 const struct sk_buff *skb);
   2050	int		(*md5_parse)(struct sock *sk,
   2051				     int optname,
   2052				     sockptr_t optval,
   2053				     int optlen);
   2054#endif
   2055};
   2056
   2057struct tcp_request_sock_ops {
   2058	u16 mss_clamp;
   2059#ifdef CONFIG_TCP_MD5SIG
   2060	struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
   2061						 const struct sock *addr_sk);
   2062	int		(*calc_md5_hash) (char *location,
   2063					  const struct tcp_md5sig_key *md5,
   2064					  const struct sock *sk,
   2065					  const struct sk_buff *skb);
   2066#endif
   2067#ifdef CONFIG_SYN_COOKIES
   2068	__u32 (*cookie_init_seq)(const struct sk_buff *skb,
   2069				 __u16 *mss);
   2070#endif
   2071	struct dst_entry *(*route_req)(const struct sock *sk,
   2072				       struct sk_buff *skb,
   2073				       struct flowi *fl,
   2074				       struct request_sock *req);
   2075	u32 (*init_seq)(const struct sk_buff *skb);
   2076	u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
   2077	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
   2078			   struct flowi *fl, struct request_sock *req,
   2079			   struct tcp_fastopen_cookie *foc,
   2080			   enum tcp_synack_type synack_type,
   2081			   struct sk_buff *syn_skb);
   2082};
   2083
   2084extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
   2085#if IS_ENABLED(CONFIG_IPV6)
   2086extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
   2087#endif
   2088
   2089#ifdef CONFIG_SYN_COOKIES
   2090static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
   2091					 const struct sock *sk, struct sk_buff *skb,
   2092					 __u16 *mss)
   2093{
   2094	tcp_synq_overflow(sk);
   2095	__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
   2096	return ops->cookie_init_seq(skb, mss);
   2097}
   2098#else
   2099static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
   2100					 const struct sock *sk, struct sk_buff *skb,
   2101					 __u16 *mss)
   2102{
   2103	return 0;
   2104}
   2105#endif
   2106
   2107int tcpv4_offload_init(void);
   2108
   2109void tcp_v4_init(void);
   2110void tcp_init(void);
   2111
   2112/* tcp_recovery.c */
   2113void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
   2114void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
   2115extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
   2116				u32 reo_wnd);
   2117extern bool tcp_rack_mark_lost(struct sock *sk);
   2118extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
   2119			     u64 xmit_time);
   2120extern void tcp_rack_reo_timeout(struct sock *sk);
   2121extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
   2122
   2123/* At how many usecs into the future should the RTO fire? */
   2124static inline s64 tcp_rto_delta_us(const struct sock *sk)
   2125{
   2126	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
   2127	u32 rto = inet_csk(sk)->icsk_rto;
   2128	u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
   2129
   2130	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
   2131}
   2132
   2133/*
   2134 * Save and compile IPv4 options, return a pointer to it
   2135 */
   2136static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
   2137							 struct sk_buff *skb)
   2138{
   2139	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
   2140	struct ip_options_rcu *dopt = NULL;
   2141
   2142	if (opt->optlen) {
   2143		int opt_size = sizeof(*dopt) + opt->optlen;
   2144
   2145		dopt = kmalloc(opt_size, GFP_ATOMIC);
   2146		if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) {
   2147			kfree(dopt);
   2148			dopt = NULL;
   2149		}
   2150	}
   2151	return dopt;
   2152}
   2153
   2154/* locally generated TCP pure ACKs have skb->truesize == 2
   2155 * (check tcp_send_ack() in net/ipv4/tcp_output.c )
   2156 * This is much faster than dissecting the packet to find out.
   2157 * (Think of GRE encapsulations, IPv4, IPv6, ...)
   2158 */
   2159static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
   2160{
   2161	return skb->truesize == 2;
   2162}
   2163
   2164static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
   2165{
   2166	skb->truesize = 2;
   2167}
   2168
   2169static inline int tcp_inq(struct sock *sk)
   2170{
   2171	struct tcp_sock *tp = tcp_sk(sk);
   2172	int answ;
   2173
   2174	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
   2175		answ = 0;
   2176	} else if (sock_flag(sk, SOCK_URGINLINE) ||
   2177		   !tp->urg_data ||
   2178		   before(tp->urg_seq, tp->copied_seq) ||
   2179		   !before(tp->urg_seq, tp->rcv_nxt)) {
   2180
   2181		answ = tp->rcv_nxt - tp->copied_seq;
   2182
   2183		/* Subtract 1, if FIN was received */
   2184		if (answ && sock_flag(sk, SOCK_DONE))
   2185			answ--;
   2186	} else {
   2187		answ = tp->urg_seq - tp->copied_seq;
   2188	}
   2189
   2190	return answ;
   2191}
   2192
   2193int tcp_peek_len(struct socket *sock);
   2194
   2195static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
   2196{
   2197	u16 segs_in;
   2198
   2199	segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
   2200
   2201	/* We update these fields while other threads might
   2202	 * read them from tcp_get_info()
   2203	 */
   2204	WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in);
   2205	if (skb->len > tcp_hdrlen(skb))
   2206		WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in);
   2207}
   2208
   2209/*
   2210 * TCP listen path runs lockless.
   2211 * We forced "struct sock" to be const qualified to make sure
   2212 * we don't modify one of its field by mistake.
   2213 * Here, we increment sk_drops which is an atomic_t, so we can safely
   2214 * make sock writable again.
   2215 */
   2216static inline void tcp_listendrop(const struct sock *sk)
   2217{
   2218	atomic_inc(&((struct sock *)sk)->sk_drops);
   2219	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
   2220}
   2221
   2222enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
   2223
   2224/*
   2225 * Interface for adding Upper Level Protocols over TCP
   2226 */
   2227
   2228#define TCP_ULP_NAME_MAX	16
   2229#define TCP_ULP_MAX		128
   2230#define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
   2231
   2232struct tcp_ulp_ops {
   2233	struct list_head	list;
   2234
   2235	/* initialize ulp */
   2236	int (*init)(struct sock *sk);
   2237	/* update ulp */
   2238	void (*update)(struct sock *sk, struct proto *p,
   2239		       void (*write_space)(struct sock *sk));
   2240	/* cleanup ulp */
   2241	void (*release)(struct sock *sk);
   2242	/* diagnostic */
   2243	int (*get_info)(const struct sock *sk, struct sk_buff *skb);
   2244	size_t (*get_info_size)(const struct sock *sk);
   2245	/* clone ulp */
   2246	void (*clone)(const struct request_sock *req, struct sock *newsk,
   2247		      const gfp_t priority);
   2248
   2249	char		name[TCP_ULP_NAME_MAX];
   2250	struct module	*owner;
   2251};
   2252int tcp_register_ulp(struct tcp_ulp_ops *type);
   2253void tcp_unregister_ulp(struct tcp_ulp_ops *type);
   2254int tcp_set_ulp(struct sock *sk, const char *name);
   2255void tcp_get_available_ulp(char *buf, size_t len);
   2256void tcp_cleanup_ulp(struct sock *sk);
   2257void tcp_update_ulp(struct sock *sk, struct proto *p,
   2258		    void (*write_space)(struct sock *sk));
   2259
   2260#define MODULE_ALIAS_TCP_ULP(name)				\
   2261	__MODULE_INFO(alias, alias_userspace, name);		\
   2262	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
   2263
   2264#ifdef CONFIG_NET_SOCK_MSG
   2265struct sk_msg;
   2266struct sk_psock;
   2267
   2268#ifdef CONFIG_BPF_SYSCALL
   2269struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
   2270int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
   2271void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
   2272#endif /* CONFIG_BPF_SYSCALL */
   2273
   2274int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
   2275			  int flags);
   2276#endif /* CONFIG_NET_SOCK_MSG */
   2277
   2278#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
   2279static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
   2280{
   2281}
   2282#endif
   2283
   2284#ifdef CONFIG_CGROUP_BPF
   2285static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
   2286				      struct sk_buff *skb,
   2287				      unsigned int end_offset)
   2288{
   2289	skops->skb = skb;
   2290	skops->skb_data_end = skb->data + end_offset;
   2291}
   2292#else
   2293static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
   2294				      struct sk_buff *skb,
   2295				      unsigned int end_offset)
   2296{
   2297}
   2298#endif
   2299
   2300/* Call BPF_SOCK_OPS program that returns an int. If the return value
   2301 * is < 0, then the BPF op failed (for example if the loaded BPF
   2302 * program does not support the chosen operation or there is no BPF
   2303 * program loaded).
   2304 */
   2305#ifdef CONFIG_BPF
   2306static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
   2307{
   2308	struct bpf_sock_ops_kern sock_ops;
   2309	int ret;
   2310
   2311	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
   2312	if (sk_fullsock(sk)) {
   2313		sock_ops.is_fullsock = 1;
   2314		sock_owned_by_me(sk);
   2315	}
   2316
   2317	sock_ops.sk = sk;
   2318	sock_ops.op = op;
   2319	if (nargs > 0)
   2320		memcpy(sock_ops.args, args, nargs * sizeof(*args));
   2321
   2322	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
   2323	if (ret == 0)
   2324		ret = sock_ops.reply;
   2325	else
   2326		ret = -1;
   2327	return ret;
   2328}
   2329
   2330static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
   2331{
   2332	u32 args[2] = {arg1, arg2};
   2333
   2334	return tcp_call_bpf(sk, op, 2, args);
   2335}
   2336
   2337static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
   2338				    u32 arg3)
   2339{
   2340	u32 args[3] = {arg1, arg2, arg3};
   2341
   2342	return tcp_call_bpf(sk, op, 3, args);
   2343}
   2344
   2345#else
   2346static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
   2347{
   2348	return -EPERM;
   2349}
   2350
   2351static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
   2352{
   2353	return -EPERM;
   2354}
   2355
   2356static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
   2357				    u32 arg3)
   2358{
   2359	return -EPERM;
   2360}
   2361
   2362#endif
   2363
   2364static inline u32 tcp_timeout_init(struct sock *sk)
   2365{
   2366	int timeout;
   2367
   2368	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
   2369
   2370	if (timeout <= 0)
   2371		timeout = TCP_TIMEOUT_INIT;
   2372	return min_t(int, timeout, TCP_RTO_MAX);
   2373}
   2374
   2375static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
   2376{
   2377	int rwnd;
   2378
   2379	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
   2380
   2381	if (rwnd < 0)
   2382		rwnd = 0;
   2383	return rwnd;
   2384}
   2385
   2386static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
   2387{
   2388	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
   2389}
   2390
   2391static inline void tcp_bpf_rtt(struct sock *sk)
   2392{
   2393	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
   2394		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
   2395}
   2396
   2397#if IS_ENABLED(CONFIG_SMC)
   2398extern struct static_key_false tcp_have_smc;
   2399#endif
   2400
   2401#if IS_ENABLED(CONFIG_TLS_DEVICE)
   2402void clean_acked_data_enable(struct inet_connection_sock *icsk,
   2403			     void (*cad)(struct sock *sk, u32 ack_seq));
   2404void clean_acked_data_disable(struct inet_connection_sock *icsk);
   2405void clean_acked_data_flush(void);
   2406#endif
   2407
   2408DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
   2409static inline void tcp_add_tx_delay(struct sk_buff *skb,
   2410				    const struct tcp_sock *tp)
   2411{
   2412	if (static_branch_unlikely(&tcp_tx_delay_enabled))
   2413		skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
   2414}
   2415
   2416/* Compute Earliest Departure Time for some control packets
   2417 * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
   2418 */
   2419static inline u64 tcp_transmit_time(const struct sock *sk)
   2420{
   2421	if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
   2422		u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
   2423			tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
   2424
   2425		return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
   2426	}
   2427	return 0;
   2428}
   2429
   2430#endif	/* _TCP_H */