cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tcp_output.c (122071B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		Implementation of the Transmission Control Protocol(TCP).
      8 *
      9 * Authors:	Ross Biro
     10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     11 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     12 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     13 *		Florian La Roche, <flla@stud.uni-sb.de>
     14 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     15 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     16 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     17 *		Matthew Dillon, <dillon@apollo.west.oic.com>
     18 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     19 *		Jorge Cwik, <jorge@laser.satlink.net>
     20 */
     21
     22/*
     23 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
     24 *				:	Fragmentation on mtu decrease
     25 *				:	Segment collapse on retransmit
     26 *				:	AF independence
     27 *
     28 *		Linus Torvalds	:	send_delayed_ack
     29 *		David S. Miller	:	Charge memory using the right skb
     30 *					during syn/ack processing.
     31 *		David S. Miller :	Output engine completely rewritten.
     32 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
     33 *		Cacophonix Gaul :	draft-minshall-nagle-01
     34 *		J Hadi Salim	:	ECN support
     35 *
     36 */
     37
     38#define pr_fmt(fmt) "TCP: " fmt
     39
     40#include <net/tcp.h>
     41#include <net/mptcp.h>
     42
     43#include <linux/compiler.h>
     44#include <linux/gfp.h>
     45#include <linux/module.h>
     46#include <linux/static_key.h>
     47
     48#include <trace/events/tcp.h>
     49
     50/* Refresh clocks of a TCP socket,
      51 * ensuring monotonically increasing values.
     52 */
     53void tcp_mstamp_refresh(struct tcp_sock *tp)
     54{
     55	u64 val = tcp_clock_ns();
     56
     57	tp->tcp_clock_cache = val;
     58	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
     59}
     60
     61static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
     62			   int push_one, gfp_t gfp);
     63
     64/* Account for new data that has been sent to the network. */
     65static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
     66{
     67	struct inet_connection_sock *icsk = inet_csk(sk);
     68	struct tcp_sock *tp = tcp_sk(sk);
     69	unsigned int prior_packets = tp->packets_out;
     70
     71	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
     72
     73	__skb_unlink(skb, &sk->sk_write_queue);
     74	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
     75
     76	if (tp->highest_sack == NULL)
     77		tp->highest_sack = skb;
     78
     79	tp->packets_out += tcp_skb_pcount(skb);
     80	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
     81		tcp_rearm_rto(sk);
     82
     83	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
     84		      tcp_skb_pcount(skb));
     85	tcp_check_space(sk);
     86}
     87
      88/* SND.NXT, if the window was not shrunk or the amount shrunk was less than
      89 * one window scaling factor (due to loss of precision).
     90 * If window has been shrunk, what should we make? It is not clear at all.
     91 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
     92 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
     93 * invalid. OK, let's make this for now:
     94 */
     95static inline __u32 tcp_acceptable_seq(const struct sock *sk)
     96{
     97	const struct tcp_sock *tp = tcp_sk(sk);
     98
     99	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
    100	    (tp->rx_opt.wscale_ok &&
    101	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
    102		return tp->snd_nxt;
    103	else
    104		return tcp_wnd_end(tp);
    105}
    106
    107/* Calculate mss to advertise in SYN segment.
    108 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
    109 *
    110 * 1. It is independent of path mtu.
     111 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
    112 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
    113 *    attached devices, because some buggy hosts are confused by
    114 *    large MSS.
     115 * 4. We do not do 3; we advertise an MSS calculated from the first
     116 *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
    117 *    This may be overridden via information stored in routing table.
    118 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
    119 *    probably even Jumbo".
    120 */
    121static __u16 tcp_advertise_mss(struct sock *sk)
    122{
    123	struct tcp_sock *tp = tcp_sk(sk);
    124	const struct dst_entry *dst = __sk_dst_get(sk);
    125	int mss = tp->advmss;
    126
    127	if (dst) {
    128		unsigned int metric = dst_metric_advmss(dst);
    129
    130		if (metric < mss) {
    131			mss = metric;
    132			tp->advmss = mss;
    133		}
    134	}
    135
    136	return (__u16)mss;
    137}
    138
     139/* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
     140 * This is the first part of the cwnd validation mechanism.
    141 */
    142void tcp_cwnd_restart(struct sock *sk, s32 delta)
    143{
    144	struct tcp_sock *tp = tcp_sk(sk);
    145	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
    146	u32 cwnd = tcp_snd_cwnd(tp);
    147
    148	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
    149
    150	tp->snd_ssthresh = tcp_current_ssthresh(sk);
    151	restart_cwnd = min(restart_cwnd, cwnd);
    152
    153	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
    154		cwnd >>= 1;
    155	tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
    156	tp->snd_cwnd_stamp = tcp_jiffies32;
    157	tp->snd_cwnd_used = 0;
    158}
    159
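The loop above halves the congestion window once for every full RTO contained in the idle time, stopping once the restart window is reached and never returning less than it. A minimal userspace sketch of that arithmetic, with the kernel's jiffies-based bookkeeping replaced by plain integers (the numbers below are illustrative only):

#include <stdio.h>
#include <stdint.h>

/* Same shape as the loop in tcp_cwnd_restart(): halve cwnd once per RTO
 * of idle time, stop early once we reach the restart window, and never
 * return less than restart_cwnd.
 */
static uint32_t cwnd_after_idle(uint32_t cwnd, uint32_t restart_cwnd,
				int32_t delta, int32_t rto)
{
	while ((delta -= rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	return cwnd > restart_cwnd ? cwnd : restart_cwnd;
}

int main(void)
{
	/* e.g. cwnd 64, restart window 10, 900 ms idle with a 200 ms RTO:
	 * cwnd is halved to 32, 16, 8, then clamped back up to 10.
	 */
	printf("%u\n", cwnd_after_idle(64, 10, 900, 200));
	return 0;
}
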
    160/* Congestion state accounting after a packet has been sent. */
    161static void tcp_event_data_sent(struct tcp_sock *tp,
    162				struct sock *sk)
    163{
    164	struct inet_connection_sock *icsk = inet_csk(sk);
    165	const u32 now = tcp_jiffies32;
    166
    167	if (tcp_packets_in_flight(tp) == 0)
    168		tcp_ca_event(sk, CA_EVENT_TX_START);
    169
     170	/* If this is the first data packet sent in response to the
     171	 * previously received data, and the reply is sent within the
     172	 * ACK timeout (ato) of the last received packet,
     173	 * increase the pingpong count.
     174	 */
    175	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
    176	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
    177		inet_csk_inc_pingpong_cnt(sk);
    178
    179	tp->lsndtime = now;
    180}
    181
    182/* Account for an ACK we sent. */
    183static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
    184				      u32 rcv_nxt)
    185{
    186	struct tcp_sock *tp = tcp_sk(sk);
    187
    188	if (unlikely(tp->compressed_ack)) {
    189		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
    190			      tp->compressed_ack);
    191		tp->compressed_ack = 0;
    192		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
    193			__sock_put(sk);
    194	}
    195
    196	if (unlikely(rcv_nxt != tp->rcv_nxt))
    197		return;  /* Special ACK sent by DCTCP to reflect ECN */
    198	tcp_dec_quickack_mode(sk, pkts);
    199	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
    200}
    201
    202/* Determine a window scaling and initial window to offer.
    203 * Based on the assumption that the given amount of space
    204 * will be offered. Store the results in the tp structure.
    205 * NOTE: for smooth operation initial space offering should
    206 * be a multiple of mss if possible. We assume here that mss >= 1.
    207 * This MUST be enforced by all callers.
    208 */
    209void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
    210			       __u32 *rcv_wnd, __u32 *window_clamp,
    211			       int wscale_ok, __u8 *rcv_wscale,
    212			       __u32 init_rcv_wnd)
    213{
    214	unsigned int space = (__space < 0 ? 0 : __space);
    215
    216	/* If no clamp set the clamp to the max possible scaled window */
    217	if (*window_clamp == 0)
    218		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
    219	space = min(*window_clamp, space);
    220
    221	/* Quantize space offering to a multiple of mss if possible. */
    222	if (space > mss)
    223		space = rounddown(space, mss);
    224
    225	/* NOTE: offering an initial window larger than 32767
    226	 * will break some buggy TCP stacks. If the admin tells us
    227	 * it is likely we could be speaking with such a buggy stack
    228	 * we will truncate our initial window offering to 32K-1
    229	 * unless the remote has sent us a window scaling option,
    230	 * which we interpret as a sign the remote TCP is not
    231	 * misinterpreting the window field as a signed quantity.
    232	 */
    233	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
    234		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
    235	else
    236		(*rcv_wnd) = min_t(u32, space, U16_MAX);
    237
    238	if (init_rcv_wnd)
    239		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
    240
    241	*rcv_wscale = 0;
    242	if (wscale_ok) {
    243		/* Set window scaling on max possible window */
    244		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
    245		space = max_t(u32, space, sysctl_rmem_max);
    246		space = min_t(u32, space, *window_clamp);
    247		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
    248				      0, TCP_MAX_WSCALE);
    249	}
    250	/* Set the clamp no higher than max representable value */
    251	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
    252}
    253EXPORT_SYMBOL(tcp_select_initial_window);
    254
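For illustration, here is a small standalone program mirroring the wscale computation above: ilog2(space) - 15, clamped to [0, TCP_MAX_WSCALE], i.e. the smallest shift that lets the whole receive buffer be expressed in the 16-bit window field. ilog2() and the clamp are open-coded here, and the buffer sizes are made up:

#include <stdio.h>
#include <stdint.h>

#define TCP_MAX_WSCALE 14	/* same limit the kernel uses (RFC 7323) */

/* Open-coded ilog2(): index of the highest set bit. */
static int ilog2_u32(uint32_t v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

/* Mirrors "clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE)" from
 * tcp_select_initial_window(): up to 64KB needs no scaling, larger
 * buffers need just enough shift to fit in 16 bits.
 */
static int pick_rcv_wscale(uint32_t space)
{
	int ws = ilog2_u32(space) - 15;

	if (ws < 0)
		ws = 0;
	if (ws > TCP_MAX_WSCALE)
		ws = TCP_MAX_WSCALE;
	return ws;
}

int main(void)
{
	printf("32KB -> wscale %d\n", pick_rcv_wscale(32 * 1024));	/* 0 */
	printf("64KB -> wscale %d\n", pick_rcv_wscale(64 * 1024));	/* 1 */
	printf("6MB  -> wscale %d\n", pick_rcv_wscale(6 * 1024 * 1024));/* 7 */
	return 0;
}
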
     255/* Choose a new window to advertise, update state in tcp_sock for the
    256 * socket, and return result with RFC1323 scaling applied.  The return
    257 * value can be stuffed directly into th->window for an outgoing
    258 * frame.
    259 */
    260static u16 tcp_select_window(struct sock *sk)
    261{
    262	struct tcp_sock *tp = tcp_sk(sk);
    263	u32 old_win = tp->rcv_wnd;
    264	u32 cur_win = tcp_receive_window(tp);
    265	u32 new_win = __tcp_select_window(sk);
    266
    267	/* Never shrink the offered window */
    268	if (new_win < cur_win) {
    269		/* Danger Will Robinson!
    270		 * Don't update rcv_wup/rcv_wnd here or else
    271		 * we will not be able to advertise a zero
    272		 * window in time.  --DaveM
    273		 *
    274		 * Relax Will Robinson.
    275		 */
    276		if (new_win == 0)
    277			NET_INC_STATS(sock_net(sk),
    278				      LINUX_MIB_TCPWANTZEROWINDOWADV);
    279		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
    280	}
    281	tp->rcv_wnd = new_win;
    282	tp->rcv_wup = tp->rcv_nxt;
    283
    284	/* Make sure we do not exceed the maximum possible
    285	 * scaled window.
    286	 */
    287	if (!tp->rx_opt.rcv_wscale &&
    288	    sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
    289		new_win = min(new_win, MAX_TCP_WINDOW);
    290	else
    291		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
    292
    293	/* RFC1323 scaling applied */
    294	new_win >>= tp->rx_opt.rcv_wscale;
    295
    296	/* If we advertise zero window, disable fast path. */
    297	if (new_win == 0) {
    298		tp->pred_flags = 0;
    299		if (old_win)
    300			NET_INC_STATS(sock_net(sk),
    301				      LINUX_MIB_TCPTOZEROWINDOWADV);
    302	} else if (old_win == 0) {
    303		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
    304	}
    305
    306	return new_win;
    307}
    308
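A quick illustration of why the shrink path aligns to 1 << rcv_wscale: the value placed in th->window is the byte window shifted right by the scale factor, and the peer shifts it back, so the low bits are never visible on the wire. The sizes in this standalone sketch are arbitrary:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t rcv_wnd = 123456;	/* current receive window in bytes */
	int wscale = 7;			/* negotiated rcv_wscale */

	/* What tcp_select_window() puts in the 16-bit window field ... */
	uint16_t on_wire = rcv_wnd >> wscale;
	/* ... and what the peer reconstructs from it. */
	uint32_t seen_by_peer = (uint32_t)on_wire << wscale;

	printf("field %u, peer sees %u bytes (%u bytes lost to scaling)\n",
	       on_wire, seen_by_peer, rcv_wnd - seen_by_peer);
	return 0;
}
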
    309/* Packet ECN state for a SYN-ACK */
    310static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
    311{
    312	const struct tcp_sock *tp = tcp_sk(sk);
    313
    314	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
    315	if (!(tp->ecn_flags & TCP_ECN_OK))
    316		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
    317	else if (tcp_ca_needs_ecn(sk) ||
    318		 tcp_bpf_ca_needs_ecn(sk))
    319		INET_ECN_xmit(sk);
    320}
    321
    322/* Packet ECN state for a SYN.  */
    323static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
    324{
    325	struct tcp_sock *tp = tcp_sk(sk);
    326	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
    327	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
    328		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
    329
    330	if (!use_ecn) {
    331		const struct dst_entry *dst = __sk_dst_get(sk);
    332
    333		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
    334			use_ecn = true;
    335	}
    336
    337	tp->ecn_flags = 0;
    338
    339	if (use_ecn) {
    340		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
    341		tp->ecn_flags = TCP_ECN_OK;
    342		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
    343			INET_ECN_xmit(sk);
    344	}
    345}
    346
    347static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
    348{
    349	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
    350		/* tp->ecn_flags are cleared at a later point in time when
     351		 * the SYN-ACK is ultimately received.
    352		 */
    353		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
    354}
    355
    356static void
    357tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
    358{
    359	if (inet_rsk(req)->ecn_ok)
    360		th->ece = 1;
    361}
    362
     363/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
    364 * be sent.
    365 */
    366static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
    367			 struct tcphdr *th, int tcp_header_len)
    368{
    369	struct tcp_sock *tp = tcp_sk(sk);
    370
    371	if (tp->ecn_flags & TCP_ECN_OK) {
    372		/* Not-retransmitted data segment: set ECT and inject CWR. */
    373		if (skb->len != tcp_header_len &&
    374		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
    375			INET_ECN_xmit(sk);
    376			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
    377				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
    378				th->cwr = 1;
    379				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
    380			}
    381		} else if (!tcp_ca_needs_ecn(sk)) {
    382			/* ACK or retransmitted segment: clear ECT|CE */
    383			INET_ECN_dontxmit(sk);
    384		}
    385		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
    386			th->ece = 1;
    387	}
    388}
    389
    390/* Constructs common control bits of non-data skb. If SYN/FIN is present,
    391 * auto increment end seqno.
    392 */
    393static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
    394{
    395	skb->ip_summed = CHECKSUM_PARTIAL;
    396
    397	TCP_SKB_CB(skb)->tcp_flags = flags;
    398
    399	tcp_skb_pcount_set(skb, 1);
    400
    401	TCP_SKB_CB(skb)->seq = seq;
    402	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
    403		seq++;
    404	TCP_SKB_CB(skb)->end_seq = seq;
    405}
    406
    407static inline bool tcp_urg_mode(const struct tcp_sock *tp)
    408{
    409	return tp->snd_una != tp->snd_up;
    410}
    411
    412#define OPTION_SACK_ADVERTISE	BIT(0)
    413#define OPTION_TS		BIT(1)
    414#define OPTION_MD5		BIT(2)
    415#define OPTION_WSCALE		BIT(3)
    416#define OPTION_FAST_OPEN_COOKIE	BIT(8)
    417#define OPTION_SMC		BIT(9)
    418#define OPTION_MPTCP		BIT(10)
    419
    420static void smc_options_write(__be32 *ptr, u16 *options)
    421{
    422#if IS_ENABLED(CONFIG_SMC)
    423	if (static_branch_unlikely(&tcp_have_smc)) {
    424		if (unlikely(OPTION_SMC & *options)) {
    425			*ptr++ = htonl((TCPOPT_NOP  << 24) |
    426				       (TCPOPT_NOP  << 16) |
    427				       (TCPOPT_EXP <<  8) |
    428				       (TCPOLEN_EXP_SMC_BASE));
    429			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
    430		}
    431	}
    432#endif
    433}
    434
    435struct tcp_out_options {
    436	u16 options;		/* bit field of OPTION_* */
    437	u16 mss;		/* 0 to disable */
    438	u8 ws;			/* window scale, 0 to disable */
    439	u8 num_sack_blocks;	/* number of SACK blocks to include */
    440	u8 hash_size;		/* bytes in hash_location */
    441	u8 bpf_opt_len;		/* length of BPF hdr option */
    442	__u8 *hash_location;	/* temporary pointer, overloaded */
    443	__u32 tsval, tsecr;	/* need to include OPTION_TS */
    444	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
    445	struct mptcp_out_options mptcp;
    446};
    447
    448static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
    449				struct tcp_sock *tp,
    450				struct tcp_out_options *opts)
    451{
    452#if IS_ENABLED(CONFIG_MPTCP)
    453	if (unlikely(OPTION_MPTCP & opts->options))
    454		mptcp_write_options(th, ptr, tp, &opts->mptcp);
    455#endif
    456}
    457
    458#ifdef CONFIG_CGROUP_BPF
    459static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
    460					enum tcp_synack_type synack_type)
    461{
    462	if (unlikely(!skb))
    463		return BPF_WRITE_HDR_TCP_CURRENT_MSS;
    464
    465	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
    466		return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
    467
    468	return 0;
    469}
    470
    471/* req, syn_skb and synack_type are used when writing synack */
    472static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
    473				  struct request_sock *req,
    474				  struct sk_buff *syn_skb,
    475				  enum tcp_synack_type synack_type,
    476				  struct tcp_out_options *opts,
    477				  unsigned int *remaining)
    478{
    479	struct bpf_sock_ops_kern sock_ops;
    480	int err;
    481
    482	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
    483					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
    484	    !*remaining)
    485		return;
    486
    487	/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
    488
    489	/* init sock_ops */
    490	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
    491
    492	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
    493
    494	if (req) {
    495		/* The listen "sk" cannot be passed here because
    496		 * it is not locked.  It would not make too much
    497		 * sense to do bpf_setsockopt(listen_sk) based
    498		 * on individual connection request also.
    499		 *
    500		 * Thus, "req" is passed here and the cgroup-bpf-progs
    501		 * of the listen "sk" will be run.
    502		 *
     503		 * "req" is also used here for fastopen even though the "sk"
     504		 * here is a fullsock "child" sk.  This keeps the behavior
    505		 * consistent between fastopen and non-fastopen on
    506		 * the bpf programming side.
    507		 */
    508		sock_ops.sk = (struct sock *)req;
    509		sock_ops.syn_skb = syn_skb;
    510	} else {
    511		sock_owned_by_me(sk);
    512
    513		sock_ops.is_fullsock = 1;
    514		sock_ops.sk = sk;
    515	}
    516
    517	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
    518	sock_ops.remaining_opt_len = *remaining;
    519	/* tcp_current_mss() does not pass a skb */
    520	if (skb)
    521		bpf_skops_init_skb(&sock_ops, skb, 0);
    522
    523	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
    524
    525	if (err || sock_ops.remaining_opt_len == *remaining)
    526		return;
    527
    528	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
    529	/* round up to 4 bytes */
    530	opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
    531
    532	*remaining -= opts->bpf_opt_len;
    533}
    534
    535static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
    536				    struct request_sock *req,
    537				    struct sk_buff *syn_skb,
    538				    enum tcp_synack_type synack_type,
    539				    struct tcp_out_options *opts)
    540{
    541	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
    542	struct bpf_sock_ops_kern sock_ops;
    543	int err;
    544
    545	if (likely(!max_opt_len))
    546		return;
    547
    548	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
    549
    550	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
    551
    552	if (req) {
    553		sock_ops.sk = (struct sock *)req;
    554		sock_ops.syn_skb = syn_skb;
    555	} else {
    556		sock_owned_by_me(sk);
    557
    558		sock_ops.is_fullsock = 1;
    559		sock_ops.sk = sk;
    560	}
    561
    562	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
    563	sock_ops.remaining_opt_len = max_opt_len;
    564	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
    565	bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
    566
    567	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
    568
    569	if (err)
    570		nr_written = 0;
    571	else
    572		nr_written = max_opt_len - sock_ops.remaining_opt_len;
    573
    574	if (nr_written < max_opt_len)
    575		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
    576		       max_opt_len - nr_written);
    577}
    578#else
    579static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
    580				  struct request_sock *req,
    581				  struct sk_buff *syn_skb,
    582				  enum tcp_synack_type synack_type,
    583				  struct tcp_out_options *opts,
    584				  unsigned int *remaining)
    585{
    586}
    587
    588static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
    589				    struct request_sock *req,
    590				    struct sk_buff *syn_skb,
    591				    enum tcp_synack_type synack_type,
    592				    struct tcp_out_options *opts)
    593{
    594}
    595#endif
    596
    597/* Write previously computed TCP options to the packet.
    598 *
     599 * Beware: Something in the Internet is very sensitive to the ordering of
     600 * TCP options; we learned this the hard way, so be careful here.
     601 * Luckily we can at least blame others for their non-compliance, but from
     602 * an interoperability perspective it seems that we're somewhat stuck with
     603 * the ordering we have been using if we want to keep working with
     604 * those broken things (not that it currently hurts anybody, as there isn't
     605 * a particular reason why the ordering would need to be changed).
    606 *
    607 * At least SACK_PERM as the first option is known to lead to a disaster
    608 * (but it may well be that other scenarios fail similarly).
    609 */
    610static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
    611			      struct tcp_out_options *opts)
    612{
    613	__be32 *ptr = (__be32 *)(th + 1);
    614	u16 options = opts->options;	/* mungable copy */
    615
    616	if (unlikely(OPTION_MD5 & options)) {
    617		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
    618			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
    619		/* overload cookie hash location */
    620		opts->hash_location = (__u8 *)ptr;
    621		ptr += 4;
    622	}
    623
    624	if (unlikely(opts->mss)) {
    625		*ptr++ = htonl((TCPOPT_MSS << 24) |
    626			       (TCPOLEN_MSS << 16) |
    627			       opts->mss);
    628	}
    629
    630	if (likely(OPTION_TS & options)) {
    631		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
    632			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
    633				       (TCPOLEN_SACK_PERM << 16) |
    634				       (TCPOPT_TIMESTAMP << 8) |
    635				       TCPOLEN_TIMESTAMP);
    636			options &= ~OPTION_SACK_ADVERTISE;
    637		} else {
    638			*ptr++ = htonl((TCPOPT_NOP << 24) |
    639				       (TCPOPT_NOP << 16) |
    640				       (TCPOPT_TIMESTAMP << 8) |
    641				       TCPOLEN_TIMESTAMP);
    642		}
    643		*ptr++ = htonl(opts->tsval);
    644		*ptr++ = htonl(opts->tsecr);
    645	}
    646
    647	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
    648		*ptr++ = htonl((TCPOPT_NOP << 24) |
    649			       (TCPOPT_NOP << 16) |
    650			       (TCPOPT_SACK_PERM << 8) |
    651			       TCPOLEN_SACK_PERM);
    652	}
    653
    654	if (unlikely(OPTION_WSCALE & options)) {
    655		*ptr++ = htonl((TCPOPT_NOP << 24) |
    656			       (TCPOPT_WINDOW << 16) |
    657			       (TCPOLEN_WINDOW << 8) |
    658			       opts->ws);
    659	}
    660
    661	if (unlikely(opts->num_sack_blocks)) {
    662		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
    663			tp->duplicate_sack : tp->selective_acks;
    664		int this_sack;
    665
    666		*ptr++ = htonl((TCPOPT_NOP  << 24) |
    667			       (TCPOPT_NOP  << 16) |
    668			       (TCPOPT_SACK <<  8) |
    669			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
    670						     TCPOLEN_SACK_PERBLOCK)));
    671
    672		for (this_sack = 0; this_sack < opts->num_sack_blocks;
    673		     ++this_sack) {
    674			*ptr++ = htonl(sp[this_sack].start_seq);
    675			*ptr++ = htonl(sp[this_sack].end_seq);
    676		}
    677
    678		tp->rx_opt.dsack = 0;
    679	}
    680
    681	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
    682		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
    683		u8 *p = (u8 *)ptr;
    684		u32 len; /* Fast Open option length */
    685
    686		if (foc->exp) {
    687			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
    688			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
    689				     TCPOPT_FASTOPEN_MAGIC);
    690			p += TCPOLEN_EXP_FASTOPEN_BASE;
    691		} else {
    692			len = TCPOLEN_FASTOPEN_BASE + foc->len;
    693			*p++ = TCPOPT_FASTOPEN;
    694			*p++ = len;
    695		}
    696
    697		memcpy(p, foc->val, foc->len);
    698		if ((len & 3) == 2) {
    699			p[foc->len] = TCPOPT_NOP;
    700			p[foc->len + 1] = TCPOPT_NOP;
    701		}
    702		ptr += (len + 3) >> 2;
    703	}
    704
    705	smc_options_write(ptr, &options);
    706
    707	mptcp_options_write(th, ptr, tp, opts);
    708}
    709
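As a concrete example of the word-at-a-time packing above, the following standalone snippet builds the 12-byte NOP/NOP/TIMESTAMP block the same way the OPTION_TS branch does. The TCPOPT_*/TCPOLEN_* values are copied from the kernel headers; the timestamp values are arbitrary:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

int main(void)
{
	uint32_t words[3];
	uint32_t *ptr = words;
	uint32_t tsval = 0x11223344, tsecr = 0x55667788;	/* example values */
	const uint8_t *bytes = (const uint8_t *)words;

	/* NOP, NOP, kind=8, len=10, then the two 32-bit timestamps. */
	*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
		       (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
	*ptr++ = htonl(tsval);
	*ptr++ = htonl(tsecr);

	for (size_t i = 0; i < sizeof(words); i++)
		printf("%02x ", bytes[i]);
	printf("\n");	/* 01 01 08 0a 11 22 33 44 55 66 77 88 */
	return 0;
}
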
    710static void smc_set_option(const struct tcp_sock *tp,
    711			   struct tcp_out_options *opts,
    712			   unsigned int *remaining)
    713{
    714#if IS_ENABLED(CONFIG_SMC)
    715	if (static_branch_unlikely(&tcp_have_smc)) {
    716		if (tp->syn_smc) {
    717			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
    718				opts->options |= OPTION_SMC;
    719				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
    720			}
    721		}
    722	}
    723#endif
    724}
    725
    726static void smc_set_option_cond(const struct tcp_sock *tp,
    727				const struct inet_request_sock *ireq,
    728				struct tcp_out_options *opts,
    729				unsigned int *remaining)
    730{
    731#if IS_ENABLED(CONFIG_SMC)
    732	if (static_branch_unlikely(&tcp_have_smc)) {
    733		if (tp->syn_smc && ireq->smc_ok) {
    734			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
    735				opts->options |= OPTION_SMC;
    736				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
    737			}
    738		}
    739	}
    740#endif
    741}
    742
    743static void mptcp_set_option_cond(const struct request_sock *req,
    744				  struct tcp_out_options *opts,
    745				  unsigned int *remaining)
    746{
    747	if (rsk_is_mptcp(req)) {
    748		unsigned int size;
    749
    750		if (mptcp_synack_options(req, &size, &opts->mptcp)) {
    751			if (*remaining >= size) {
    752				opts->options |= OPTION_MPTCP;
    753				*remaining -= size;
    754			}
    755		}
    756	}
    757}
    758
    759/* Compute TCP options for SYN packets. This is not the final
    760 * network wire format yet.
    761 */
    762static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
    763				struct tcp_out_options *opts,
    764				struct tcp_md5sig_key **md5)
    765{
    766	struct tcp_sock *tp = tcp_sk(sk);
    767	unsigned int remaining = MAX_TCP_OPTION_SPACE;
    768	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
    769
    770	*md5 = NULL;
    771#ifdef CONFIG_TCP_MD5SIG
    772	if (static_branch_unlikely(&tcp_md5_needed) &&
    773	    rcu_access_pointer(tp->md5sig_info)) {
    774		*md5 = tp->af_specific->md5_lookup(sk, sk);
    775		if (*md5) {
    776			opts->options |= OPTION_MD5;
    777			remaining -= TCPOLEN_MD5SIG_ALIGNED;
    778		}
    779	}
    780#endif
    781
    782	/* We always get an MSS option.  The option bytes which will be seen in
    783	 * normal data packets should timestamps be used, must be in the MSS
    784	 * advertised.  But we subtract them from tp->mss_cache so that
    785	 * calculations in tcp_sendmsg are simpler etc.  So account for this
    786	 * fact here if necessary.  If we don't do this correctly, as a
    787	 * receiver we won't recognize data packets as being full sized when we
    788	 * should, and thus we won't abide by the delayed ACK rules correctly.
    789	 * SACKs don't matter, we never delay an ACK when we have any of those
    790	 * going out.  */
    791	opts->mss = tcp_advertise_mss(sk);
    792	remaining -= TCPOLEN_MSS_ALIGNED;
    793
    794	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
    795		opts->options |= OPTION_TS;
    796		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
    797		opts->tsecr = tp->rx_opt.ts_recent;
    798		remaining -= TCPOLEN_TSTAMP_ALIGNED;
    799	}
    800	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
    801		opts->ws = tp->rx_opt.rcv_wscale;
    802		opts->options |= OPTION_WSCALE;
    803		remaining -= TCPOLEN_WSCALE_ALIGNED;
    804	}
    805	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
    806		opts->options |= OPTION_SACK_ADVERTISE;
    807		if (unlikely(!(OPTION_TS & opts->options)))
    808			remaining -= TCPOLEN_SACKPERM_ALIGNED;
    809	}
    810
    811	if (fastopen && fastopen->cookie.len >= 0) {
    812		u32 need = fastopen->cookie.len;
    813
    814		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
    815					       TCPOLEN_FASTOPEN_BASE;
    816		need = (need + 3) & ~3U;  /* Align to 32 bits */
    817		if (remaining >= need) {
    818			opts->options |= OPTION_FAST_OPEN_COOKIE;
    819			opts->fastopen_cookie = &fastopen->cookie;
    820			remaining -= need;
    821			tp->syn_fastopen = 1;
    822			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
    823		}
    824	}
    825
    826	smc_set_option(tp, opts, &remaining);
    827
    828	if (sk_is_mptcp(sk)) {
    829		unsigned int size;
    830
    831		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
    832			opts->options |= OPTION_MPTCP;
    833			remaining -= size;
    834		}
    835	}
    836
    837	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
    838
    839	return MAX_TCP_OPTION_SPACE - remaining;
    840}
    841
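The bookkeeping above is plain byte budgeting: every option subtracts its 4-byte-aligned length from the 40 bytes of option space a TCP header can carry. A small standalone illustration with the kernel's constant values hard-coded; the particular option mix is just an example:

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE     40	/* 60-byte max header - 20-byte base */
#define TCPOLEN_MSS_ALIGNED       4
#define TCPOLEN_TSTAMP_ALIGNED   12
#define TCPOLEN_WSCALE_ALIGNED    4
#define TCPOLEN_SACKPERM_ALIGNED  4

int main(void)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

	remaining -= TCPOLEN_MSS_ALIGNED;	/* MSS is always sent on a SYN */
	remaining -= TCPOLEN_TSTAMP_ALIGNED;	/* timestamps enabled */
	remaining -= TCPOLEN_WSCALE_ALIGNED;	/* window scaling enabled */
	/* SACK-permitted is folded into the timestamp word, so it only costs
	 * TCPOLEN_SACKPERM_ALIGNED when timestamps are off.
	 */

	printf("used %u of %u option bytes, %u left for MD5/TFO/MPTCP/BPF\n",
	       MAX_TCP_OPTION_SPACE - remaining, MAX_TCP_OPTION_SPACE,
	       remaining);
	return 0;
}
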
    842/* Set up TCP options for SYN-ACKs. */
    843static unsigned int tcp_synack_options(const struct sock *sk,
    844				       struct request_sock *req,
    845				       unsigned int mss, struct sk_buff *skb,
    846				       struct tcp_out_options *opts,
    847				       const struct tcp_md5sig_key *md5,
    848				       struct tcp_fastopen_cookie *foc,
    849				       enum tcp_synack_type synack_type,
    850				       struct sk_buff *syn_skb)
    851{
    852	struct inet_request_sock *ireq = inet_rsk(req);
    853	unsigned int remaining = MAX_TCP_OPTION_SPACE;
    854
    855#ifdef CONFIG_TCP_MD5SIG
    856	if (md5) {
    857		opts->options |= OPTION_MD5;
    858		remaining -= TCPOLEN_MD5SIG_ALIGNED;
    859
    860		/* We can't fit any SACK blocks in a packet with MD5 + TS
    861		 * options. There was discussion about disabling SACK
    862		 * rather than TS in order to fit in better with old,
    863		 * buggy kernels, but that was deemed to be unnecessary.
    864		 */
    865		if (synack_type != TCP_SYNACK_COOKIE)
    866			ireq->tstamp_ok &= !ireq->sack_ok;
    867	}
    868#endif
    869
    870	/* We always send an MSS option. */
    871	opts->mss = mss;
    872	remaining -= TCPOLEN_MSS_ALIGNED;
    873
    874	if (likely(ireq->wscale_ok)) {
    875		opts->ws = ireq->rcv_wscale;
    876		opts->options |= OPTION_WSCALE;
    877		remaining -= TCPOLEN_WSCALE_ALIGNED;
    878	}
    879	if (likely(ireq->tstamp_ok)) {
    880		opts->options |= OPTION_TS;
    881		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
    882		opts->tsecr = req->ts_recent;
    883		remaining -= TCPOLEN_TSTAMP_ALIGNED;
    884	}
    885	if (likely(ireq->sack_ok)) {
    886		opts->options |= OPTION_SACK_ADVERTISE;
    887		if (unlikely(!ireq->tstamp_ok))
    888			remaining -= TCPOLEN_SACKPERM_ALIGNED;
    889	}
    890	if (foc != NULL && foc->len >= 0) {
    891		u32 need = foc->len;
    892
    893		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
    894				   TCPOLEN_FASTOPEN_BASE;
    895		need = (need + 3) & ~3U;  /* Align to 32 bits */
    896		if (remaining >= need) {
    897			opts->options |= OPTION_FAST_OPEN_COOKIE;
    898			opts->fastopen_cookie = foc;
    899			remaining -= need;
    900		}
    901	}
    902
    903	mptcp_set_option_cond(req, opts, &remaining);
    904
    905	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
    906
    907	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
    908			      synack_type, opts, &remaining);
    909
    910	return MAX_TCP_OPTION_SPACE - remaining;
    911}
    912
    913/* Compute TCP options for ESTABLISHED sockets. This is not the
    914 * final wire format yet.
    915 */
    916static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
    917					struct tcp_out_options *opts,
    918					struct tcp_md5sig_key **md5)
    919{
    920	struct tcp_sock *tp = tcp_sk(sk);
    921	unsigned int size = 0;
    922	unsigned int eff_sacks;
    923
    924	opts->options = 0;
    925
    926	*md5 = NULL;
    927#ifdef CONFIG_TCP_MD5SIG
    928	if (static_branch_unlikely(&tcp_md5_needed) &&
    929	    rcu_access_pointer(tp->md5sig_info)) {
    930		*md5 = tp->af_specific->md5_lookup(sk, sk);
    931		if (*md5) {
    932			opts->options |= OPTION_MD5;
    933			size += TCPOLEN_MD5SIG_ALIGNED;
    934		}
    935	}
    936#endif
    937
    938	if (likely(tp->rx_opt.tstamp_ok)) {
    939		opts->options |= OPTION_TS;
    940		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
    941		opts->tsecr = tp->rx_opt.ts_recent;
    942		size += TCPOLEN_TSTAMP_ALIGNED;
    943	}
    944
    945	/* MPTCP options have precedence over SACK for the limited TCP
     946 * option space because an MPTCP connection would be forced to
    947	 * fall back to regular TCP if a required multipath option is
    948	 * missing. SACK still gets a chance to use whatever space is
    949	 * left.
    950	 */
    951	if (sk_is_mptcp(sk)) {
    952		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
    953		unsigned int opt_size = 0;
    954
    955		if (mptcp_established_options(sk, skb, &opt_size, remaining,
    956					      &opts->mptcp)) {
    957			opts->options |= OPTION_MPTCP;
    958			size += opt_size;
    959		}
    960	}
    961
    962	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
    963	if (unlikely(eff_sacks)) {
    964		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
    965		if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
    966					 TCPOLEN_SACK_PERBLOCK))
    967			return size;
    968
    969		opts->num_sack_blocks =
    970			min_t(unsigned int, eff_sacks,
    971			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
    972			      TCPOLEN_SACK_PERBLOCK);
    973
    974		size += TCPOLEN_SACK_BASE_ALIGNED +
    975			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
    976	}
    977
    978	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
    979					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
    980		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
    981
    982		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
    983
    984		size = MAX_TCP_OPTION_SPACE - remaining;
    985	}
    986
    987	return size;
    988}
    989
    990
    991/* TCP SMALL QUEUES (TSQ)
    992 *
     993 * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev)
     994 * to reduce RTT and bufferbloat.
     995 * We do this using a special skb destructor (tcp_wfree).
     996 *
     997 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
     998 * the skb needs to be reallocated in a driver.
     999 * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
    1000 *
    1001 * Since transmitting from an skb destructor is forbidden, we use a tasklet
   1002 * to process all sockets that eventually need to send more skbs.
   1003 * We use one tasklet per cpu, with its own queue of sockets.
   1004 */
   1005struct tsq_tasklet {
   1006	struct tasklet_struct	tasklet;
   1007	struct list_head	head; /* queue of tcp sockets */
   1008};
   1009static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
   1010
   1011static void tcp_tsq_write(struct sock *sk)
   1012{
   1013	if ((1 << sk->sk_state) &
   1014	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
   1015	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
   1016		struct tcp_sock *tp = tcp_sk(sk);
   1017
   1018		if (tp->lost_out > tp->retrans_out &&
   1019		    tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
   1020			tcp_mstamp_refresh(tp);
   1021			tcp_xmit_retransmit_queue(sk);
   1022		}
   1023
   1024		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
   1025			       0, GFP_ATOMIC);
   1026	}
   1027}
   1028
   1029static void tcp_tsq_handler(struct sock *sk)
   1030{
   1031	bh_lock_sock(sk);
   1032	if (!sock_owned_by_user(sk))
   1033		tcp_tsq_write(sk);
   1034	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
   1035		sock_hold(sk);
   1036	bh_unlock_sock(sk);
   1037}
   1038/*
   1039 * One tasklet per cpu tries to send more skbs.
   1040 * We run in tasklet context but need to disable irqs when
   1041 * transferring tsq->head because tcp_wfree() might
   1042 * interrupt us (non NAPI drivers)
   1043 */
   1044static void tcp_tasklet_func(struct tasklet_struct *t)
   1045{
   1046	struct tsq_tasklet *tsq = from_tasklet(tsq,  t, tasklet);
   1047	LIST_HEAD(list);
   1048	unsigned long flags;
   1049	struct list_head *q, *n;
   1050	struct tcp_sock *tp;
   1051	struct sock *sk;
   1052
   1053	local_irq_save(flags);
   1054	list_splice_init(&tsq->head, &list);
   1055	local_irq_restore(flags);
   1056
   1057	list_for_each_safe(q, n, &list) {
   1058		tp = list_entry(q, struct tcp_sock, tsq_node);
   1059		list_del(&tp->tsq_node);
   1060
   1061		sk = (struct sock *)tp;
   1062		smp_mb__before_atomic();
   1063		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
   1064
   1065		tcp_tsq_handler(sk);
   1066		sk_free(sk);
   1067	}
   1068}
   1069
   1070#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
   1071			  TCPF_WRITE_TIMER_DEFERRED |	\
   1072			  TCPF_DELACK_TIMER_DEFERRED |	\
   1073			  TCPF_MTU_REDUCED_DEFERRED)
   1074/**
   1075 * tcp_release_cb - tcp release_sock() callback
   1076 * @sk: socket
   1077 *
   1078 * called from release_sock() to perform protocol dependent
   1079 * actions before socket release.
   1080 */
   1081void tcp_release_cb(struct sock *sk)
   1082{
   1083	unsigned long flags, nflags;
   1084
   1085	/* perform an atomic operation only if at least one flag is set */
   1086	do {
   1087		flags = sk->sk_tsq_flags;
   1088		if (!(flags & TCP_DEFERRED_ALL))
   1089			return;
   1090		nflags = flags & ~TCP_DEFERRED_ALL;
   1091	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
   1092
   1093	if (flags & TCPF_TSQ_DEFERRED) {
   1094		tcp_tsq_write(sk);
   1095		__sock_put(sk);
   1096	}
   1097	/* Here begins the tricky part :
   1098	 * We are called from release_sock() with :
   1099	 * 1) BH disabled
   1100	 * 2) sk_lock.slock spinlock held
   1101	 * 3) socket owned by us (sk->sk_lock.owned == 1)
   1102	 *
    1103	 * But the following code is meant to be called from BH handlers,
    1104	 * so we should keep BH disabled, but release socket ownership early.
   1105	 */
   1106	sock_release_ownership(sk);
   1107
   1108	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
   1109		tcp_write_timer_handler(sk);
   1110		__sock_put(sk);
   1111	}
   1112	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
   1113		tcp_delack_timer_handler(sk);
   1114		__sock_put(sk);
   1115	}
   1116	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
   1117		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
   1118		__sock_put(sk);
   1119	}
   1120}
   1121EXPORT_SYMBOL(tcp_release_cb);
   1122
   1123void __init tcp_tasklet_init(void)
   1124{
   1125	int i;
   1126
   1127	for_each_possible_cpu(i) {
   1128		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
   1129
   1130		INIT_LIST_HEAD(&tsq->head);
   1131		tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
   1132	}
   1133}
   1134
   1135/*
   1136 * Write buffer destructor automatically called from kfree_skb.
   1137 * We can't xmit new skbs from this context, as we might already
   1138 * hold qdisc lock.
   1139 */
   1140void tcp_wfree(struct sk_buff *skb)
   1141{
   1142	struct sock *sk = skb->sk;
   1143	struct tcp_sock *tp = tcp_sk(sk);
   1144	unsigned long flags, nval, oval;
   1145
   1146	/* Keep one reference on sk_wmem_alloc.
   1147	 * Will be released by sk_free() from here or tcp_tasklet_func()
   1148	 */
   1149	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
   1150
   1151	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
   1152	 * Wait until our queues (qdisc + devices) are drained.
   1153	 * This gives :
   1154	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
   1155	 * - chance for incoming ACK (processed by another cpu maybe)
   1156	 *   to migrate this flow (skb->ooo_okay will be eventually set)
   1157	 */
   1158	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
   1159		goto out;
   1160
   1161	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
   1162		struct tsq_tasklet *tsq;
   1163		bool empty;
   1164
   1165		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
   1166			goto out;
   1167
   1168		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
   1169		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
   1170		if (nval != oval)
   1171			continue;
   1172
   1173		/* queue this socket to tasklet queue */
   1174		local_irq_save(flags);
   1175		tsq = this_cpu_ptr(&tsq_tasklet);
   1176		empty = list_empty(&tsq->head);
   1177		list_add(&tp->tsq_node, &tsq->head);
   1178		if (empty)
   1179			tasklet_schedule(&tsq->tasklet);
   1180		local_irq_restore(flags);
   1181		return;
   1182	}
   1183out:
   1184	sk_free(sk);
   1185}
   1186
   1187/* Note: Called under soft irq.
   1188 * We can call TCP stack right away, unless socket is owned by user.
   1189 */
   1190enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
   1191{
   1192	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
   1193	struct sock *sk = (struct sock *)tp;
   1194
   1195	tcp_tsq_handler(sk);
   1196	sock_put(sk);
   1197
   1198	return HRTIMER_NORESTART;
   1199}
   1200
   1201static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
   1202				      u64 prior_wstamp)
   1203{
   1204	struct tcp_sock *tp = tcp_sk(sk);
   1205
   1206	if (sk->sk_pacing_status != SK_PACING_NONE) {
   1207		unsigned long rate = sk->sk_pacing_rate;
   1208
   1209		/* Original sch_fq does not pace first 10 MSS
   1210		 * Note that tp->data_segs_out overflows after 2^32 packets,
   1211		 * this is a minor annoyance.
   1212		 */
   1213		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
   1214			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
   1215			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
   1216
   1217			/* take into account OS jitter */
   1218			len_ns -= min_t(u64, len_ns / 2, credit);
   1219			tp->tcp_wstamp_ns += len_ns;
   1220		}
   1221	}
   1222	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
   1223}
   1224
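The pacing adjustment above boils down to: charge this skb its ideal serialization time at the pacing rate, minus up to half of that as credit for wall-clock time already elapsed since the previous send. A standalone sketch of the same arithmetic; rate, length and credit are made-up numbers:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t rate = 125000000;	/* pacing rate: 1 Gbit/s in bytes/sec */
	uint64_t skb_len = 65536;	/* a 64KB TSO skb */
	uint64_t credit = 100000;	/* 100 us already elapsed since last send */

	/* Ideal time this skb occupies the wire at the pacing rate. */
	uint64_t len_ns = skb_len * NSEC_PER_SEC / rate;
	/* Account for OS jitter, but never more than half of len_ns. */
	uint64_t slack = len_ns / 2 < credit ? len_ns / 2 : credit;

	printf("ideal gap %" PRIu64 " ns, tcp_wstamp_ns advanced by %" PRIu64 " ns\n",
	       len_ns, len_ns - slack);
	return 0;
}
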
   1225INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
   1226INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
   1227INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
   1228
   1229/* This routine actually transmits TCP packets queued in by
   1230 * tcp_do_sendmsg().  This is used by both the initial
   1231 * transmission and possible later retransmissions.
   1232 * All SKB's seen here are completely headerless.  It is our
   1233 * job to build the TCP header, and pass the packet down to
   1234 * IP so it can do the same plus pass the packet off to the
   1235 * device.
   1236 *
   1237 * We are working here with either a clone of the original
   1238 * SKB, or a fresh unique copy made by the retransmit engine.
   1239 */
   1240static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
   1241			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
   1242{
   1243	const struct inet_connection_sock *icsk = inet_csk(sk);
   1244	struct inet_sock *inet;
   1245	struct tcp_sock *tp;
   1246	struct tcp_skb_cb *tcb;
   1247	struct tcp_out_options opts;
   1248	unsigned int tcp_options_size, tcp_header_size;
   1249	struct sk_buff *oskb = NULL;
   1250	struct tcp_md5sig_key *md5;
   1251	struct tcphdr *th;
   1252	u64 prior_wstamp;
   1253	int err;
   1254
   1255	BUG_ON(!skb || !tcp_skb_pcount(skb));
   1256	tp = tcp_sk(sk);
   1257	prior_wstamp = tp->tcp_wstamp_ns;
   1258	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
   1259	skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
   1260	if (clone_it) {
   1261		oskb = skb;
   1262
   1263		tcp_skb_tsorted_save(oskb) {
   1264			if (unlikely(skb_cloned(oskb)))
   1265				skb = pskb_copy(oskb, gfp_mask);
   1266			else
   1267				skb = skb_clone(oskb, gfp_mask);
   1268		} tcp_skb_tsorted_restore(oskb);
   1269
   1270		if (unlikely(!skb))
   1271			return -ENOBUFS;
   1272		/* retransmit skbs might have a non zero value in skb->dev
   1273		 * because skb->dev is aliased with skb->rbnode.rb_left
   1274		 */
   1275		skb->dev = NULL;
   1276	}
   1277
   1278	inet = inet_sk(sk);
   1279	tcb = TCP_SKB_CB(skb);
   1280	memset(&opts, 0, sizeof(opts));
   1281
   1282	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
   1283		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
   1284	} else {
   1285		tcp_options_size = tcp_established_options(sk, skb, &opts,
   1286							   &md5);
   1287		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
    1288		 * at receiver : This slightly improves GRO performance.
   1289		 * Note that we do not force the PSH flag for non GSO packets,
   1290		 * because they might be sent under high congestion events,
   1291		 * and in this case it is better to delay the delivery of 1-MSS
   1292		 * packets and thus the corresponding ACK packet that would
   1293		 * release the following packet.
   1294		 */
   1295		if (tcp_skb_pcount(skb) > 1)
   1296			tcb->tcp_flags |= TCPHDR_PSH;
   1297	}
   1298	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
   1299
   1300	/* if no packet is in qdisc/device queue, then allow XPS to select
   1301	 * another queue. We can be called from tcp_tsq_handler()
   1302	 * which holds one reference to sk.
   1303	 *
   1304	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
   1305	 * One way to get this would be to set skb->truesize = 2 on them.
   1306	 */
   1307	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
   1308
   1309	/* If we had to use memory reserve to allocate this skb,
   1310	 * this might cause drops if packet is looped back :
   1311	 * Other socket might not have SOCK_MEMALLOC.
   1312	 * Packets not looped back do not care about pfmemalloc.
   1313	 */
   1314	skb->pfmemalloc = 0;
   1315
   1316	skb_push(skb, tcp_header_size);
   1317	skb_reset_transport_header(skb);
   1318
   1319	skb_orphan(skb);
   1320	skb->sk = sk;
   1321	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
   1322	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
   1323
   1324	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
   1325
   1326	/* Build TCP header and checksum it. */
   1327	th = (struct tcphdr *)skb->data;
   1328	th->source		= inet->inet_sport;
   1329	th->dest		= inet->inet_dport;
   1330	th->seq			= htonl(tcb->seq);
   1331	th->ack_seq		= htonl(rcv_nxt);
   1332	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
   1333					tcb->tcp_flags);
   1334
   1335	th->check		= 0;
   1336	th->urg_ptr		= 0;
   1337
   1338	/* The urg_mode check is necessary during a below snd_una win probe */
   1339	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
   1340		if (before(tp->snd_up, tcb->seq + 0x10000)) {
   1341			th->urg_ptr = htons(tp->snd_up - tcb->seq);
   1342			th->urg = 1;
   1343		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
   1344			th->urg_ptr = htons(0xFFFF);
   1345			th->urg = 1;
   1346		}
   1347	}
   1348
   1349	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
   1350	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
   1351		th->window      = htons(tcp_select_window(sk));
   1352		tcp_ecn_send(sk, skb, th, tcp_header_size);
   1353	} else {
   1354		/* RFC1323: The window in SYN & SYN/ACK segments
   1355		 * is never scaled.
   1356		 */
   1357		th->window	= htons(min(tp->rcv_wnd, 65535U));
   1358	}
   1359
   1360	tcp_options_write(th, tp, &opts);
   1361
   1362#ifdef CONFIG_TCP_MD5SIG
   1363	/* Calculate the MD5 hash, as we have all we need now */
   1364	if (md5) {
   1365		sk_gso_disable(sk);
   1366		tp->af_specific->calc_md5_hash(opts.hash_location,
   1367					       md5, sk, skb);
   1368	}
   1369#endif
   1370
   1371	/* BPF prog is the last one writing header option */
   1372	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
   1373
   1374	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
   1375			   tcp_v6_send_check, tcp_v4_send_check,
   1376			   sk, skb);
   1377
   1378	if (likely(tcb->tcp_flags & TCPHDR_ACK))
   1379		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
   1380
   1381	if (skb->len != tcp_header_size) {
   1382		tcp_event_data_sent(tp, sk);
   1383		tp->data_segs_out += tcp_skb_pcount(skb);
   1384		tp->bytes_sent += skb->len - tcp_header_size;
   1385	}
   1386
   1387	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
   1388		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
   1389			      tcp_skb_pcount(skb));
   1390
   1391	tp->segs_out += tcp_skb_pcount(skb);
   1392	skb_set_hash_from_sk(skb, sk);
    1393	/* OK, it's time to fill skb_shinfo(skb)->gso_{segs|size} */
   1394	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
   1395	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
   1396
   1397	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
   1398
   1399	/* Cleanup our debris for IP stacks */
   1400	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
   1401			       sizeof(struct inet6_skb_parm)));
   1402
   1403	tcp_add_tx_delay(skb, tp);
   1404
   1405	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
   1406				 inet6_csk_xmit, ip_queue_xmit,
   1407				 sk, skb, &inet->cork.fl);
   1408
   1409	if (unlikely(err > 0)) {
   1410		tcp_enter_cwr(sk);
   1411		err = net_xmit_eval(err);
   1412	}
   1413	if (!err && oskb) {
   1414		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
   1415		tcp_rate_skb_sent(sk, oskb);
   1416	}
   1417	return err;
   1418}
   1419
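One line in __tcp_transmit_skb() that is easy to misread is the store through "(((__be16 *)th) + 6)": it fills bytes 12-13 of the TCP header, i.e. the 4-bit data offset (header length in 32-bit words) together with the flag bits, in a single 16-bit write. A standalone sketch; the flag values match the kernel's TCPHDR_* defines and the header size is an example:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10

int main(void)
{
	unsigned int tcp_header_size = 32;	/* 20-byte header + 12 bytes of options */
	uint8_t tcp_flags = TCPHDR_PSH | TCPHDR_ACK;

	/* Data offset (in 32-bit words) in the top nibble, flags in the
	 * low byte, stored in network byte order.
	 */
	uint16_t field = htons(((tcp_header_size >> 2) << 12) | tcp_flags);
	const uint8_t *b = (const uint8_t *)&field;

	printf("header bytes 12-13: %02x %02x\n", b[0], b[1]);	/* 80 18 */
	return 0;
}
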
   1420static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
   1421			    gfp_t gfp_mask)
   1422{
   1423	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
   1424				  tcp_sk(sk)->rcv_nxt);
   1425}
   1426
   1427/* This routine just queues the buffer for sending.
   1428 *
   1429 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
   1430 * otherwise socket can stall.
   1431 */
   1432static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
   1433{
   1434	struct tcp_sock *tp = tcp_sk(sk);
   1435
   1436	/* Advance write_seq and place onto the write_queue. */
   1437	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
   1438	__skb_header_release(skb);
   1439	tcp_add_write_queue_tail(sk, skb);
   1440	sk_wmem_queued_add(sk, skb->truesize);
   1441	sk_mem_charge(sk, skb->truesize);
   1442}
   1443
   1444/* Initialize TSO segments for a packet. */
   1445static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
   1446{
   1447	if (skb->len <= mss_now) {
   1448		/* Avoid the costly divide in the normal
   1449		 * non-TSO case.
   1450		 */
   1451		tcp_skb_pcount_set(skb, 1);
   1452		TCP_SKB_CB(skb)->tcp_gso_size = 0;
   1453	} else {
   1454		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
   1455		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
   1456	}
   1457}
   1458
   1459/* Pcount in the middle of the write queue got changed, we need to do various
   1460 * tweaks to fix counters
   1461 */
   1462static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
   1463{
   1464	struct tcp_sock *tp = tcp_sk(sk);
   1465
   1466	tp->packets_out -= decr;
   1467
   1468	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
   1469		tp->sacked_out -= decr;
   1470	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
   1471		tp->retrans_out -= decr;
   1472	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
   1473		tp->lost_out -= decr;
   1474
   1475	/* Reno case is special. Sigh... */
   1476	if (tcp_is_reno(tp) && decr > 0)
   1477		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
   1478
   1479	if (tp->lost_skb_hint &&
   1480	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
   1481	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
   1482		tp->lost_cnt_hint -= decr;
   1483
   1484	tcp_verify_left_out(tp);
   1485}
   1486
   1487static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
   1488{
   1489	return TCP_SKB_CB(skb)->txstamp_ack ||
   1490		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
   1491}
   1492
   1493static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
   1494{
   1495	struct skb_shared_info *shinfo = skb_shinfo(skb);
   1496
   1497	if (unlikely(tcp_has_tx_tstamp(skb)) &&
   1498	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
   1499		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
   1500		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
   1501
   1502		shinfo->tx_flags &= ~tsflags;
   1503		shinfo2->tx_flags |= tsflags;
   1504		swap(shinfo->tskey, shinfo2->tskey);
   1505		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
   1506		TCP_SKB_CB(skb)->txstamp_ack = 0;
   1507	}
   1508}
   1509
   1510static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
   1511{
   1512	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
   1513	TCP_SKB_CB(skb)->eor = 0;
   1514}
   1515
   1516/* Insert buff after skb on the write or rtx queue of sk.  */
   1517static void tcp_insert_write_queue_after(struct sk_buff *skb,
   1518					 struct sk_buff *buff,
   1519					 struct sock *sk,
   1520					 enum tcp_queue tcp_queue)
   1521{
   1522	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
   1523		__skb_queue_after(&sk->sk_write_queue, skb, buff);
   1524	else
   1525		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
   1526}
   1527
   1528/* Function to create two new TCP segments.  Shrinks the given segment
   1529 * to the specified size and appends a new segment with the rest of the
   1530 * packet to the list.  This won't be called frequently, I hope.
   1531 * Remember, these are still headerless SKBs at this point.
   1532 */
   1533int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
   1534		 struct sk_buff *skb, u32 len,
   1535		 unsigned int mss_now, gfp_t gfp)
   1536{
   1537	struct tcp_sock *tp = tcp_sk(sk);
   1538	struct sk_buff *buff;
   1539	int nsize, old_factor;
   1540	long limit;
   1541	int nlen;
   1542	u8 flags;
   1543
   1544	if (WARN_ON(len > skb->len))
   1545		return -EINVAL;
   1546
   1547	nsize = skb_headlen(skb) - len;
   1548	if (nsize < 0)
   1549		nsize = 0;
   1550
   1551	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
   1552	 * We need some allowance to not penalize applications setting small
   1553	 * SO_SNDBUF values.
   1554	 * Also allow first and last skb in retransmit queue to be split.
   1555	 */
   1556	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
   1557	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
   1558		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
   1559		     skb != tcp_rtx_queue_head(sk) &&
   1560		     skb != tcp_rtx_queue_tail(sk))) {
   1561		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
   1562		return -ENOMEM;
   1563	}
   1564
   1565	if (skb_unclone_keeptruesize(skb, gfp))
   1566		return -ENOMEM;
   1567
   1568	/* Get a new skb... force flag on. */
   1569	buff = tcp_stream_alloc_skb(sk, nsize, gfp, true);
   1570	if (!buff)
   1571		return -ENOMEM; /* We'll just try again later. */
   1572	skb_copy_decrypted(buff, skb);
   1573	mptcp_skb_ext_copy(buff, skb);
   1574
   1575	sk_wmem_queued_add(sk, buff->truesize);
   1576	sk_mem_charge(sk, buff->truesize);
   1577	nlen = skb->len - len - nsize;
   1578	buff->truesize += nlen;
   1579	skb->truesize -= nlen;
   1580
   1581	/* Correct the sequence numbers. */
   1582	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
   1583	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
   1584	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
   1585
   1586	/* PSH and FIN should only be set in the second packet. */
   1587	flags = TCP_SKB_CB(skb)->tcp_flags;
   1588	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
   1589	TCP_SKB_CB(buff)->tcp_flags = flags;
   1590	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
   1591	tcp_skb_fragment_eor(skb, buff);
   1592
   1593	skb_split(skb, buff, len);
   1594
   1595	skb_set_delivery_time(buff, skb->tstamp, true);
   1596	tcp_fragment_tstamp(skb, buff);
   1597
   1598	old_factor = tcp_skb_pcount(skb);
   1599
   1600	/* Fix up tso_factor for both original and new SKB.  */
   1601	tcp_set_skb_tso_segs(skb, mss_now);
   1602	tcp_set_skb_tso_segs(buff, mss_now);
   1603
   1604	/* Update delivered info for the new segment */
   1605	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
   1606
   1607	/* If this packet has been sent out already, we must
   1608	 * adjust the various packet counters.
   1609	 */
   1610	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
   1611		int diff = old_factor - tcp_skb_pcount(skb) -
   1612			tcp_skb_pcount(buff);
   1613
   1614		if (diff)
   1615			tcp_adjust_pcount(sk, skb, diff);
   1616	}
   1617
   1618	/* Link BUFF into the send queue. */
   1619	__skb_header_release(buff);
   1620	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
   1621	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
   1622		list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
   1623
   1624	return 0;
   1625}
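
       /* A rough worked example of the split above (illustrative values,
        * assuming mss_now = 1448 and an skb covering seq 1000..4000, i.e.
        * len 3000, split at len = 1448): buff takes seq 2448..4000 and skb
        * is trimmed to 1000..2448; PSH/FIN, if present, move to buff.
        * old_factor = 3 (ceil(3000/1448)); afterwards skb counts 1 segment
        * and buff counts 2, so diff = 0 and no pcount adjustment is needed.
        * Only when the original skb had already been sent (snd_nxt at or
        * beyond buff->end_seq) is tcp_adjust_pcount() consulted at all.
        */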
   1626
   1627/* This is similar to __pskb_pull_tail(). The difference is that pulled
   1628 * data is not copied, but immediately discarded.
   1629 */
   1630static int __pskb_trim_head(struct sk_buff *skb, int len)
   1631{
   1632	struct skb_shared_info *shinfo;
   1633	int i, k, eat;
   1634
   1635	eat = min_t(int, len, skb_headlen(skb));
   1636	if (eat) {
   1637		__skb_pull(skb, eat);
   1638		len -= eat;
   1639		if (!len)
   1640			return 0;
   1641	}
   1642	eat = len;
   1643	k = 0;
   1644	shinfo = skb_shinfo(skb);
   1645	for (i = 0; i < shinfo->nr_frags; i++) {
   1646		int size = skb_frag_size(&shinfo->frags[i]);
   1647
   1648		if (size <= eat) {
   1649			skb_frag_unref(skb, i);
   1650			eat -= size;
   1651		} else {
   1652			shinfo->frags[k] = shinfo->frags[i];
   1653			if (eat) {
   1654				skb_frag_off_add(&shinfo->frags[k], eat);
   1655				skb_frag_size_sub(&shinfo->frags[k], eat);
   1656				eat = 0;
   1657			}
   1658			k++;
   1659		}
   1660	}
   1661	shinfo->nr_frags = k;
   1662
   1663	skb->data_len -= len;
   1664	skb->len = skb->data_len;
   1665	return len;
   1666}
   1667
   1668/* Remove acked data from a packet in the transmit queue. */
   1669int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
   1670{
   1671	u32 delta_truesize;
   1672
   1673	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
   1674		return -ENOMEM;
   1675
   1676	delta_truesize = __pskb_trim_head(skb, len);
   1677
   1678	TCP_SKB_CB(skb)->seq += len;
   1679
   1680	if (delta_truesize) {
   1681		skb->truesize	   -= delta_truesize;
   1682		sk_wmem_queued_add(sk, -delta_truesize);
   1683		if (!skb_zcopy_pure(skb))
   1684			sk_mem_uncharge(sk, delta_truesize);
   1685	}
   1686
   1687	/* Any change of skb->len requires recalculation of tso factor. */
   1688	if (tcp_skb_pcount(skb) > 1)
   1689		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
   1690
   1691	return 0;
   1692}
   1693
   1694/* Calculate MSS not accounting for any TCP options.  */
   1695static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
   1696{
   1697	const struct tcp_sock *tp = tcp_sk(sk);
   1698	const struct inet_connection_sock *icsk = inet_csk(sk);
   1699	int mss_now;
   1700
   1701	/* Calculate base mss without TCP options:
   1702	   It is MMS_S - sizeof(tcphdr) of rfc1122
   1703	 */
   1704	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
   1705
   1706	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
   1707	if (icsk->icsk_af_ops->net_frag_header_len) {
   1708		const struct dst_entry *dst = __sk_dst_get(sk);
   1709
   1710		if (dst && dst_allfrag(dst))
   1711			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
   1712	}
   1713
   1714	/* Clamp it (mss_clamp does not include tcp options) */
   1715	if (mss_now > tp->rx_opt.mss_clamp)
   1716		mss_now = tp->rx_opt.mss_clamp;
   1717
   1718	/* Now subtract optional transport overhead */
   1719	mss_now -= icsk->icsk_ext_hdr_len;
   1720
   1721	/* Then reserve room for full set of TCP options and 8 bytes of data */
   1722	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
   1723	return mss_now;
   1724}
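
       /* A rough worked example, assuming IPv4 with no IP options
        * (net_header_len = 20), no extension headers and mss_clamp = 1460:
        * pmtu 1500 -> 1500 - 20 - 20 = 1460; the clamp leaves 1460 and
        * icsk_ext_hdr_len (0) changes nothing.  The final max() only
        * matters if the result would drop below tcp_min_snd_mss.
        */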
   1725
   1726/* Calculate MSS. Not accounting for SACKs here.  */
   1727int tcp_mtu_to_mss(struct sock *sk, int pmtu)
   1728{
   1729	/* Subtract TCP options size, not including SACKs */
   1730	return __tcp_mtu_to_mss(sk, pmtu) -
   1731	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
   1732}
   1733EXPORT_SYMBOL(tcp_mtu_to_mss);
   1734
   1735/* Inverse of above */
   1736int tcp_mss_to_mtu(struct sock *sk, int mss)
   1737{
   1738	const struct tcp_sock *tp = tcp_sk(sk);
   1739	const struct inet_connection_sock *icsk = inet_csk(sk);
   1740	int mtu;
   1741
   1742	mtu = mss +
   1743	      tp->tcp_header_len +
   1744	      icsk->icsk_ext_hdr_len +
   1745	      icsk->icsk_af_ops->net_header_len;
   1746
   1747	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
   1748	if (icsk->icsk_af_ops->net_frag_header_len) {
   1749		const struct dst_entry *dst = __sk_dst_get(sk);
   1750
   1751		if (dst && dst_allfrag(dst))
   1752			mtu += icsk->icsk_af_ops->net_frag_header_len;
   1753	}
   1754	return mtu;
   1755}
   1756EXPORT_SYMBOL(tcp_mss_to_mtu);
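
       /* Inverse sketch, assuming timestamps are in use (tcp_header_len =
        * 20 + 12 = 32), IPv4 and mss_clamp = 1460: tcp_mss_to_mtu(sk, 1448)
        * = 1448 + 32 + 0 + 20 = 1500, while tcp_mtu_to_mss(sk, 1500)
        * = 1460 - (32 - 20) = 1448, so for a fixed option set the two
        * functions are inverses of each other.
        */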
   1757
   1758/* MTU probing init per socket */
   1759void tcp_mtup_init(struct sock *sk)
   1760{
   1761	struct tcp_sock *tp = tcp_sk(sk);
   1762	struct inet_connection_sock *icsk = inet_csk(sk);
   1763	struct net *net = sock_net(sk);
   1764
   1765	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
   1766	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
   1767			       icsk->icsk_af_ops->net_header_len;
   1768	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
   1769	icsk->icsk_mtup.probe_size = 0;
   1770	if (icsk->icsk_mtup.enabled)
   1771		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
   1772}
   1773EXPORT_SYMBOL(tcp_mtup_init);
   1774
   1775/* This function synchronizes snd mss to current pmtu/exthdr set.
   1776
   1777   tp->rx_opt.user_mss is mss set by user via TCP_MAXSEG. It does NOT account
   1778   for TCP options, but includes only the bare TCP header.
   1779
   1780   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   1781   It is minimum of user_mss and mss received with SYN.
   1782   It also does not include TCP options.
   1783
   1784   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
   1785
   1786   tp->mss_cache is current effective sending mss, including
   1787   all tcp options except for SACKs. It is evaluated,
   1788   taking into account current pmtu, but never exceeds
   1789   tp->rx_opt.mss_clamp.
   1790
   1791   NOTE1. rfc1122 clearly states that advertised MSS
   1792   DOES NOT include either tcp or ip options.
   1793
   1794   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   1795   are READ ONLY outside this function.		--ANK (980731)
   1796 */
   1797unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
   1798{
   1799	struct tcp_sock *tp = tcp_sk(sk);
   1800	struct inet_connection_sock *icsk = inet_csk(sk);
   1801	int mss_now;
   1802
   1803	if (icsk->icsk_mtup.search_high > pmtu)
   1804		icsk->icsk_mtup.search_high = pmtu;
   1805
   1806	mss_now = tcp_mtu_to_mss(sk, pmtu);
   1807	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
   1808
   1809	/* And store cached results */
   1810	icsk->icsk_pmtu_cookie = pmtu;
   1811	if (icsk->icsk_mtup.enabled)
   1812		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
   1813	tp->mss_cache = mss_now;
   1814
   1815	return mss_now;
   1816}
   1817EXPORT_SYMBOL(tcp_sync_mss);
   1818
   1819/* Compute the current effective MSS, taking SACKs and IP options,
   1820 * and even PMTU discovery events into account.
   1821 */
   1822unsigned int tcp_current_mss(struct sock *sk)
   1823{
   1824	const struct tcp_sock *tp = tcp_sk(sk);
   1825	const struct dst_entry *dst = __sk_dst_get(sk);
   1826	u32 mss_now;
   1827	unsigned int header_len;
   1828	struct tcp_out_options opts;
   1829	struct tcp_md5sig_key *md5;
   1830
   1831	mss_now = tp->mss_cache;
   1832
   1833	if (dst) {
   1834		u32 mtu = dst_mtu(dst);
   1835		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
   1836			mss_now = tcp_sync_mss(sk, mtu);
   1837	}
   1838
   1839	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
   1840		     sizeof(struct tcphdr);
   1841	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
   1842	 * some common options. If this is an odd packet (because we have SACK
   1843	 * blocks etc) then our calculated header_len will be different, and
   1844	 * we have to adjust mss_now correspondingly */
   1845	if (header_len != tp->tcp_header_len) {
   1846		int delta = (int) header_len - tp->tcp_header_len;
   1847		mss_now -= delta;
   1848	}
   1849
   1850	return mss_now;
   1851}
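
       /* Illustrative numbers, assuming timestamps (tcp_header_len = 32) and
        * mss_cache = 1448: with no extra options the computed header_len is
        * also 32, delta = 0 and mss_now stays 1448.  If this particular skb
        * carries two SACK blocks (4 + 2 * 8 = 20 extra option bytes),
        * header_len = 52, delta = 20 and mss_now drops to 1428 for this
        * transmission only.
        */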
   1852
   1853/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
   1854 * As additional protections, we do not touch cwnd in retransmission phases,
   1855 * and if application hit its sndbuf limit recently.
   1856 */
   1857static void tcp_cwnd_application_limited(struct sock *sk)
   1858{
   1859	struct tcp_sock *tp = tcp_sk(sk);
   1860
   1861	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
   1862	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
   1863		/* Limited by application or receiver window. */
   1864		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
   1865		u32 win_used = max(tp->snd_cwnd_used, init_win);
   1866		if (win_used < tcp_snd_cwnd(tp)) {
   1867			tp->snd_ssthresh = tcp_current_ssthresh(sk);
   1868			tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
   1869		}
   1870		tp->snd_cwnd_used = 0;
   1871	}
   1872	tp->snd_cwnd_stamp = tcp_jiffies32;
   1873}
   1874
   1875static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
   1876{
   1877	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
   1878	struct tcp_sock *tp = tcp_sk(sk);
   1879
   1880	/* Track the maximum number of outstanding packets in each
   1881	 * window, and remember whether we were cwnd-limited then.
   1882	 */
   1883	if (!before(tp->snd_una, tp->max_packets_seq) ||
   1884	    tp->packets_out > tp->max_packets_out ||
   1885	    is_cwnd_limited) {
   1886		tp->max_packets_out = tp->packets_out;
   1887		tp->max_packets_seq = tp->snd_nxt;
   1888		tp->is_cwnd_limited = is_cwnd_limited;
   1889	}
   1890
   1891	if (tcp_is_cwnd_limited(sk)) {
   1892		/* Network is fed fully. */
   1893		tp->snd_cwnd_used = 0;
   1894		tp->snd_cwnd_stamp = tcp_jiffies32;
   1895	} else {
   1896		/* Network starves. */
   1897		if (tp->packets_out > tp->snd_cwnd_used)
   1898			tp->snd_cwnd_used = tp->packets_out;
   1899
   1900		if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
   1901		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
   1902		    !ca_ops->cong_control)
   1903			tcp_cwnd_application_limited(sk);
   1904
   1905		/* The following conditions together indicate the starvation
   1906		 * is caused by insufficient sender buffer:
   1907		 * 1) just sent some data (see tcp_write_xmit)
   1908		 * 2) not cwnd limited (this else condition)
   1909		 * 3) no more data to send (tcp_write_queue_empty())
   1910		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
   1911		 */
   1912		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
   1913		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
   1914		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
   1915			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
   1916	}
   1917}
   1918
   1919/* Minshall's variant of the Nagle send check. */
   1920static bool tcp_minshall_check(const struct tcp_sock *tp)
   1921{
   1922	return after(tp->snd_sml, tp->snd_una) &&
   1923		!after(tp->snd_sml, tp->snd_nxt);
   1924}
   1925
   1926/* Update snd_sml if this skb is under mss
   1927 * Note that a TSO packet might end with a sub-mss segment
   1928 * The test is really :
   1929 * if ((skb->len % mss) != 0)
   1930 *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
   1931 * But we can avoid doing the divide again given we already have
   1932 *  skb_pcount = skb->len / mss_now
   1933 */
   1934static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
   1935				const struct sk_buff *skb)
   1936{
   1937	if (skb->len < tcp_skb_pcount(skb) * mss_now)
   1938		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
   1939}
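
       /* Why the comparison above matches (skb->len % mss_now != 0),
        * assuming the tso factor was set for this same mss_now (as the
        * tcp_write_xmit() path does): tcp_skb_pcount() is then
        * DIV_ROUND_UP(skb->len, mss_now), so pcount * mss_now >= skb->len
        * with equality exactly when skb->len is a multiple of mss_now.
        * E.g. len 2000, mss 1448 -> pcount 2 and 2000 < 2896, so the
        * sub-mss tail is recorded in snd_sml; len 2896 is exactly 2 * 1448
        * and nothing is recorded.
        */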
   1940
   1941/* Return false, if packet can be sent now without violating Nagle's rules:
   1942 * 1. It is full sized. (provided by caller in %partial bool)
   1943 * 2. Or it contains FIN. (already checked by caller)
   1944 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
   1945 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
   1946 *    With Minshall's modification: all sent small packets are ACKed.
   1947 */
   1948static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
   1949			    int nonagle)
   1950{
   1951	return partial &&
   1952		((nonagle & TCP_NAGLE_CORK) ||
   1953		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
   1954}
   1955
   1956/* Return how many segs we'd like on a TSO packet,
   1957 * depending on current pacing rate, and how close the peer is.
   1958 *
   1959 * Rationale is:
   1960 * - For close peers, we rather send bigger packets to reduce
   1961 *   cpu costs, because occasional losses will be repaired fast.
   1962 * - For long distance/rtt flows, we would like to get ACK clocking
   1963 *   with 1 ACK per ms.
   1964 *
   1965 * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
   1966 * in bigger TSO bursts. We cut the RTT-based allowance in half
   1967 * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
   1968 * is below 1500 bytes after 6 * ~500 usec = 3ms.
   1969 */
   1970static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
   1971			    int min_tso_segs)
   1972{
   1973	unsigned long bytes;
   1974	u32 r;
   1975
   1976	bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
   1977
   1978	r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log;
   1979	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
   1980		bytes += sk->sk_gso_max_size >> r;
   1981
   1982	bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
   1983
   1984	return max_t(u32, bytes / mss_now, min_tso_segs);
   1985}
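
       /* A rough worked example with assumed values: sk_pacing_rate =
        * 12,500,000 B/s (~100 Mbit/s) and sk_pacing_shift = 10 give
        * bytes ~= 12207, i.e. roughly 1 ms worth of data at the pacing
        * rate.  With min_rtt = 1024 us and tcp_tso_rtt_log = 9, r = 2 and
        * sk_gso_max_size >> 2 = 16384 is added, for ~28.6 KB total, still
        * under the 64 KB GSO cap.  At mss_now = 1448 that is ~19 segments,
        * floored at min_tso_segs.
        */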
   1986
   1987/* Return the number of segments we want in the skb we are transmitting.
   1988 * See if congestion control module wants to decide; otherwise, autosize.
   1989 */
   1990static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
   1991{
   1992	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
   1993	u32 min_tso, tso_segs;
   1994
   1995	min_tso = ca_ops->min_tso_segs ?
   1996			ca_ops->min_tso_segs(sk) :
   1997			sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
   1998
   1999	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
   2000	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
   2001}
   2002
   2003/* Returns the portion of skb which can be sent right away */
   2004static unsigned int tcp_mss_split_point(const struct sock *sk,
   2005					const struct sk_buff *skb,
   2006					unsigned int mss_now,
   2007					unsigned int max_segs,
   2008					int nonagle)
   2009{
   2010	const struct tcp_sock *tp = tcp_sk(sk);
   2011	u32 partial, needed, window, max_len;
   2012
   2013	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
   2014	max_len = mss_now * max_segs;
   2015
   2016	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
   2017		return max_len;
   2018
   2019	needed = min(skb->len, window);
   2020
   2021	if (max_len <= needed)
   2022		return max_len;
   2023
   2024	partial = needed % mss_now;
   2025	/* If last segment is not a full MSS, check if Nagle rules allow us
   2026	 * to include this last segment in this skb.
   2027	 * Otherwise, we'll split the skb at last MSS boundary
   2028	 */
   2029	if (tcp_nagle_check(partial != 0, tp, nonagle))
   2030		return needed - partial;
   2031
   2032	return needed;
   2033}
   2034
   2035/* Can at least one segment of SKB be sent right now, according to the
   2036 * congestion window rules?  If so, return how many segments are allowed.
   2037 */
   2038static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
   2039					 const struct sk_buff *skb)
   2040{
   2041	u32 in_flight, cwnd, halfcwnd;
   2042
   2043	/* Don't be strict about the congestion window for the final FIN.  */
   2044	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
   2045	    tcp_skb_pcount(skb) == 1)
   2046		return 1;
   2047
   2048	in_flight = tcp_packets_in_flight(tp);
   2049	cwnd = tcp_snd_cwnd(tp);
   2050	if (in_flight >= cwnd)
   2051		return 0;
   2052
   2053	/* For better scheduling, ensure we have at least
   2054	 * 2 GSO packets in flight.
   2055	 */
   2056	halfcwnd = max(cwnd >> 1, 1U);
   2057	return min(halfcwnd, cwnd - in_flight);
   2058}
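
       /* Illustrative numbers: cwnd = 20 and in_flight = 17 give
        * halfcwnd = 10, so min(10, 20 - 17) = 3 segments may go out now;
        * with in_flight >= 20 nothing may be sent, except that a lone FIN
        * is always let through by the early test above.
        */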
   2059
   2060/* Initialize TSO state of a skb.
   2061 * This must be invoked the first time we consider transmitting
   2062 * SKB onto the wire.
   2063 */
   2064static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
   2065{
   2066	int tso_segs = tcp_skb_pcount(skb);
   2067
   2068	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
   2069		tcp_set_skb_tso_segs(skb, mss_now);
   2070		tso_segs = tcp_skb_pcount(skb);
   2071	}
   2072	return tso_segs;
   2073}
   2074
   2075
   2076/* Return true if the Nagle test allows this packet to be
   2077 * sent now.
   2078 */
   2079static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
   2080				  unsigned int cur_mss, int nonagle)
   2081{
   2082	/* Nagle rule does not apply to frames which sit in the middle of the
   2083	 * write_queue (they have no chance to get new data).
   2084	 *
   2085	 * This is implemented in the callers, where they modify the 'nonagle'
   2086	 * argument based upon the location of SKB in the send queue.
   2087	 */
   2088	if (nonagle & TCP_NAGLE_PUSH)
   2089		return true;
   2090
   2091	/* Don't use the nagle rule for urgent data (or for the final FIN). */
   2092	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
   2093		return true;
   2094
   2095	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
   2096		return true;
   2097
   2098	return false;
   2099}
   2100
   2101/* Does at least the first segment of SKB fit into the send window? */
   2102static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
   2103			     const struct sk_buff *skb,
   2104			     unsigned int cur_mss)
   2105{
   2106	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
   2107
   2108	if (skb->len > cur_mss)
   2109		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
   2110
   2111	return !after(end_seq, tcp_wnd_end(tp));
   2112}
   2113
   2114/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
   2115 * which is put after SKB on the list.  It is very much like
   2116 * tcp_fragment() except that it may make several kinds of assumptions
   2117 * in order to speed up the splitting operation.  In particular, we
   2118 * know that all the data is in scatter-gather pages, and that the
   2119 * packet has never been sent out before (and thus is not cloned).
   2120 */
   2121static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
   2122			unsigned int mss_now, gfp_t gfp)
   2123{
   2124	int nlen = skb->len - len;
   2125	struct sk_buff *buff;
   2126	u8 flags;
   2127
   2128	/* All of a TSO frame must be composed of paged data.  */
   2129	if (skb->len != skb->data_len)
   2130		return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
   2131				    skb, len, mss_now, gfp);
   2132
   2133	buff = tcp_stream_alloc_skb(sk, 0, gfp, true);
   2134	if (unlikely(!buff))
   2135		return -ENOMEM;
   2136	skb_copy_decrypted(buff, skb);
   2137	mptcp_skb_ext_copy(buff, skb);
   2138
   2139	sk_wmem_queued_add(sk, buff->truesize);
   2140	sk_mem_charge(sk, buff->truesize);
   2141	buff->truesize += nlen;
   2142	skb->truesize -= nlen;
   2143
   2144	/* Correct the sequence numbers. */
   2145	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
   2146	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
   2147	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
   2148
   2149	/* PSH and FIN should only be set in the second packet. */
   2150	flags = TCP_SKB_CB(skb)->tcp_flags;
   2151	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
   2152	TCP_SKB_CB(buff)->tcp_flags = flags;
   2153
   2154	tcp_skb_fragment_eor(skb, buff);
   2155
   2156	skb_split(skb, buff, len);
   2157	tcp_fragment_tstamp(skb, buff);
   2158
   2159	/* Fix up tso_factor for both original and new SKB.  */
   2160	tcp_set_skb_tso_segs(skb, mss_now);
   2161	tcp_set_skb_tso_segs(buff, mss_now);
   2162
   2163	/* Link BUFF into the send queue. */
   2164	__skb_header_release(buff);
   2165	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
   2166
   2167	return 0;
   2168}
   2169
   2170/* Try to defer sending, if possible, in order to minimize the amount
   2171 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
   2172 *
   2173 * This algorithm is from John Heffner.
   2174 */
   2175static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
   2176				 bool *is_cwnd_limited,
   2177				 bool *is_rwnd_limited,
   2178				 u32 max_segs)
   2179{
   2180	const struct inet_connection_sock *icsk = inet_csk(sk);
   2181	u32 send_win, cong_win, limit, in_flight;
   2182	struct tcp_sock *tp = tcp_sk(sk);
   2183	struct sk_buff *head;
   2184	int win_divisor;
   2185	s64 delta;
   2186
   2187	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
   2188		goto send_now;
   2189
   2190	/* Avoid bursty behavior by allowing defer
   2191	 * only if the last write was recent (1 ms).
   2192	 * Note that tp->tcp_wstamp_ns can be in the future if we have
   2193	 * packets waiting in a qdisc or device for EDT delivery.
   2194	 */
   2195	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
   2196	if (delta > 0)
   2197		goto send_now;
   2198
   2199	in_flight = tcp_packets_in_flight(tp);
   2200
   2201	BUG_ON(tcp_skb_pcount(skb) <= 1);
   2202	BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
   2203
   2204	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
   2205
   2206	/* From in_flight test above, we know that cwnd > in_flight.  */
   2207	cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
   2208
   2209	limit = min(send_win, cong_win);
   2210
   2211	/* If a full-sized TSO skb can be sent, do it. */
   2212	if (limit >= max_segs * tp->mss_cache)
   2213		goto send_now;
   2214
   2215	/* Middle in queue won't get any more data, full sendable already? */
   2216	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
   2217		goto send_now;
   2218
   2219	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
   2220	if (win_divisor) {
   2221		u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
   2222
   2223		/* If at least some fraction of a window is available,
   2224		 * just use it.
   2225		 */
   2226		chunk /= win_divisor;
   2227		if (limit >= chunk)
   2228			goto send_now;
   2229	} else {
   2230		/* Different approach, try not to defer past a single
   2231		 * ACK.  Receiver should ACK every other full sized
   2232		 * frame, so if we have space for more than 3 frames
   2233		 * then send now.
   2234		 */
   2235		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
   2236			goto send_now;
   2237	}
   2238
   2239	/* TODO : use tsorted_sent_queue ? */
   2240	head = tcp_rtx_queue_head(sk);
   2241	if (!head)
   2242		goto send_now;
   2243	delta = tp->tcp_clock_cache - head->tstamp;
   2244	/* If next ACK is likely to come too late (half srtt), do not defer */
   2245	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
   2246		goto send_now;
   2247
   2248	/* Ok, it looks like it is advisable to defer.
   2249	 * Three cases are tracked :
   2250	 * 1) We are cwnd-limited
   2251	 * 2) We are rwnd-limited
   2252	 * 3) We are application limited.
   2253	 */
   2254	if (cong_win < send_win) {
   2255		if (cong_win <= skb->len) {
   2256			*is_cwnd_limited = true;
   2257			return true;
   2258		}
   2259	} else {
   2260		if (send_win <= skb->len) {
   2261			*is_rwnd_limited = true;
   2262			return true;
   2263		}
   2264	}
   2265
   2266	/* If this packet won't get more data, do not wait. */
   2267	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
   2268	    TCP_SKB_CB(skb)->eor)
   2269		goto send_now;
   2270
   2271	return true;
   2272
   2273send_now:
   2274	return false;
   2275}
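
       /* Two constants above, spelled out (illustrative): the NSEC_PER_MSEC
        * term means deferral is only considered if the previous write was
        * within the last millisecond.  And because tp->srtt_us stores the
        * smoothed RTT shifted left by 3, (srtt_us >> 4) * NSEC_PER_USEC is
        * half the smoothed RTT in ns; e.g. srtt 40 ms -> srtt_us 320000 ->
        * 20 ms.  If the rtx queue head was sent less than that long ago,
        * its ACK is still at least half an srtt away, so we send now
        * rather than defer.
        */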
   2276
   2277static inline void tcp_mtu_check_reprobe(struct sock *sk)
   2278{
   2279	struct inet_connection_sock *icsk = inet_csk(sk);
   2280	struct tcp_sock *tp = tcp_sk(sk);
   2281	struct net *net = sock_net(sk);
   2282	u32 interval;
   2283	s32 delta;
   2284
   2285	interval = net->ipv4.sysctl_tcp_probe_interval;
   2286	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
   2287	if (unlikely(delta >= interval * HZ)) {
   2288		int mss = tcp_current_mss(sk);
   2289
   2290		/* Update current search range */
   2291		icsk->icsk_mtup.probe_size = 0;
   2292		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
   2293			sizeof(struct tcphdr) +
   2294			icsk->icsk_af_ops->net_header_len;
   2295		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
   2296
   2297		/* Update probe time stamp */
   2298		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
   2299	}
   2300}
   2301
   2302static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
   2303{
   2304	struct sk_buff *skb, *next;
   2305
   2306	skb = tcp_send_head(sk);
   2307	tcp_for_write_queue_from_safe(skb, next, sk) {
   2308		if (len <= skb->len)
   2309			break;
   2310
   2311		if (unlikely(TCP_SKB_CB(skb)->eor) ||
   2312		    tcp_has_tx_tstamp(skb) ||
   2313		    !skb_pure_zcopy_same(skb, next))
   2314			return false;
   2315
   2316		len -= skb->len;
   2317	}
   2318
   2319	return true;
   2320}
   2321
   2322/* Create a new MTU probe if we are ready.
   2323 * MTU probing regularly attempts to increase the path MTU by
   2324 * deliberately sending larger packets.  This discovers routing
   2325 * changes resulting in larger path MTUs.
   2326 *
   2327 * Returns 0 if we should wait to probe (no cwnd available),
   2328 *         1 if a probe was sent,
   2329 *         -1 otherwise
   2330 */
   2331static int tcp_mtu_probe(struct sock *sk)
   2332{
   2333	struct inet_connection_sock *icsk = inet_csk(sk);
   2334	struct tcp_sock *tp = tcp_sk(sk);
   2335	struct sk_buff *skb, *nskb, *next;
   2336	struct net *net = sock_net(sk);
   2337	int probe_size;
   2338	int size_needed;
   2339	int copy, len;
   2340	int mss_now;
   2341	int interval;
   2342
   2343	/* Not currently probing/verifying,
   2344	 * not in recovery,
   2345	 * have enough cwnd, and
   2346	 * not SACKing (the variable headers throw things off)
   2347	 */
   2348	if (likely(!icsk->icsk_mtup.enabled ||
   2349		   icsk->icsk_mtup.probe_size ||
   2350		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
   2351		   tcp_snd_cwnd(tp) < 11 ||
   2352		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
   2353		return -1;
   2354
   2355	/* Use binary search for probe_size between tcp_base_mss
   2356	 * and the current mss_clamp. If (search_high - search_low) is
   2357	 * smaller than a threshold, back off from probing.
   2358	 */
   2359	mss_now = tcp_current_mss(sk);
   2360	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
   2361				    icsk->icsk_mtup.search_low) >> 1);
   2362	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
   2363	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
   2364	/* When misfortune happens, we are reprobing actively,
   2365	 * and the reprobe timer has expired. We stick with the current
   2366	 * probing process by not resetting the search range to its original.
   2367	 */
   2368	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
   2369		interval < net->ipv4.sysctl_tcp_probe_threshold) {
   2370		/* Check whether enough time has elapsed for
   2371		 * another round of probing.
   2372		 */
   2373		tcp_mtu_check_reprobe(sk);
   2374		return -1;
   2375	}
   2376
   2377	/* Have enough data in the send queue to probe? */
   2378	if (tp->write_seq - tp->snd_nxt < size_needed)
   2379		return -1;
   2380
   2381	if (tp->snd_wnd < size_needed)
   2382		return -1;
   2383	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
   2384		return 0;
   2385
   2386	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
   2387	if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
   2388		if (!tcp_packets_in_flight(tp))
   2389			return -1;
   2390		else
   2391			return 0;
   2392	}
   2393
   2394	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
   2395		return -1;
   2396
   2397	/* We're allowed to probe.  Build it now. */
   2398	nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
   2399	if (!nskb)
   2400		return -1;
   2401	sk_wmem_queued_add(sk, nskb->truesize);
   2402	sk_mem_charge(sk, nskb->truesize);
   2403
   2404	skb = tcp_send_head(sk);
   2405	skb_copy_decrypted(nskb, skb);
   2406	mptcp_skb_ext_copy(nskb, skb);
   2407
   2408	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
   2409	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
   2410	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
   2411
   2412	tcp_insert_write_queue_before(nskb, skb, sk);
   2413	tcp_highest_sack_replace(sk, skb, nskb);
   2414
   2415	len = 0;
   2416	tcp_for_write_queue_from_safe(skb, next, sk) {
   2417		copy = min_t(int, skb->len, probe_size - len);
   2418		skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
   2419
   2420		if (skb->len <= copy) {
   2421			/* We've eaten all the data from this skb.
   2422			 * Throw it away. */
   2423			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
   2424			/* If this is the last SKB we copy and eor is set
   2425			 * we need to propagate it to the new skb.
   2426			 */
   2427			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
   2428			tcp_skb_collapse_tstamp(nskb, skb);
   2429			tcp_unlink_write_queue(skb, sk);
   2430			tcp_wmem_free_skb(sk, skb);
   2431		} else {
   2432			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
   2433						   ~(TCPHDR_FIN|TCPHDR_PSH);
   2434			if (!skb_shinfo(skb)->nr_frags) {
   2435				skb_pull(skb, copy);
   2436			} else {
   2437				__pskb_trim_head(skb, copy);
   2438				tcp_set_skb_tso_segs(skb, mss_now);
   2439			}
   2440			TCP_SKB_CB(skb)->seq += copy;
   2441		}
   2442
   2443		len += copy;
   2444
   2445		if (len >= probe_size)
   2446			break;
   2447	}
   2448	tcp_init_tso_segs(nskb, nskb->len);
   2449
   2450	/* We're ready to send.  If this fails, the probe will
   2451	 * be resegmented into mss-sized pieces by tcp_write_xmit().
   2452	 */
   2453	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
   2454		/* Decrement cwnd here because we are sending
   2455		 * effectively two packets. */
   2456		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
   2457		tcp_event_new_data_sent(sk, nskb);
   2458
   2459		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
   2460		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
   2461		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
   2462
   2463		return 1;
   2464	}
   2465
   2466	return -1;
   2467}
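
       /* A rough sketch of the probe sizing with assumed defaults (IPv4,
        * no TCP options, mss_clamp = 1460, tcp_base_mss = 1024,
        * reordering = 3, mss_cache = 1460): search_low = 1024 + 40 = 1064,
        * search_high = 1460 + 40 = 1500, so the midpoint MTU is 1282 and
        * probe_size = 1282 - 40 = 1242 bytes of payload.  size_needed =
        * 1242 + 4 * 1460 = 7082, so the probe is built only if at least
        * that much unsent data is queued and fits in the send window;
        * interval = 436 is well above the assumed tcp_probe_threshold, so
        * probing is not backed off.
        */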
   2468
   2469static bool tcp_pacing_check(struct sock *sk)
   2470{
   2471	struct tcp_sock *tp = tcp_sk(sk);
   2472
   2473	if (!tcp_needs_internal_pacing(sk))
   2474		return false;
   2475
   2476	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
   2477		return false;
   2478
   2479	if (!hrtimer_is_queued(&tp->pacing_timer)) {
   2480		hrtimer_start(&tp->pacing_timer,
   2481			      ns_to_ktime(tp->tcp_wstamp_ns),
   2482			      HRTIMER_MODE_ABS_PINNED_SOFT);
   2483		sock_hold(sk);
   2484	}
   2485	return true;
   2486}
   2487
   2488/* TCP Small Queues :
   2489 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
   2490 * (These limits are doubled for retransmits)
   2491 * This allows for :
   2492 *  - better RTT estimation and ACK scheduling
   2493 *  - faster recovery
   2494 *  - high rates
   2495 * Alas, some drivers / subsystems require a fair amount
   2496 * of queued bytes to ensure line rate.
   2497 * One example is wifi aggregation (802.11 AMPDU)
   2498 */
   2499static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
   2500				  unsigned int factor)
   2501{
   2502	unsigned long limit;
   2503
   2504	limit = max_t(unsigned long,
   2505		      2 * skb->truesize,
   2506		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
   2507	if (sk->sk_pacing_status == SK_PACING_NONE)
   2508		limit = min_t(unsigned long, limit,
   2509			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
   2510	limit <<= factor;
   2511
   2512	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
   2513	    tcp_sk(sk)->tcp_tx_delay) {
   2514		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
   2515
   2516		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
   2517		 * approximate our needs assuming an ~100% skb->truesize overhead.
   2518		 * USEC_PER_SEC is approximated by 2^20.
   2519		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
   2520		 */
   2521		extra_bytes >>= (20 - 1);
   2522		limit += extra_bytes;
   2523	}
   2524	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
   2525		/* Always send skb if rtx queue is empty.
   2526		 * No need to wait for TX completion to call us back,
   2527		 * after softirq/tasklet schedule.
   2528		 * This helps when TX completions are delayed too much.
   2529		 */
   2530		if (tcp_rtx_queue_empty(sk))
   2531			return false;
   2532
   2533		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
   2534		/* It is possible TX completion already happened
   2535		 * before we set TSQ_THROTTLED, so we must
   2536		 * test again the condition.
   2537		 * test the condition again.
   2538		smp_mb__after_atomic();
   2539		if (refcount_read(&sk->sk_wmem_alloc) > limit)
   2540			return true;
   2541	}
   2542	return false;
   2543}
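
       /* Rough numbers for the limit above (illustrative, assuming
        * sk_pacing_shift = 10): at sk_pacing_rate = 125,000,000 B/s
        * (~1 Gbit/s) the rate term is ~122 KB, i.e. about 1 ms of data.
        * The limit is never below two skb truesizes, is clamped by
        * tcp_limit_output_bytes when pacing is not in use, and is doubled
        * per 'factor' (retransmits pass factor = 1).  Only when
        * sk_wmem_alloc exceeds that limit, and the rtx queue is not empty,
        * do we set TSQ_THROTTLED and defer to the TX completion path.
        */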
   2544
   2545static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
   2546{
   2547	const u32 now = tcp_jiffies32;
   2548	enum tcp_chrono old = tp->chrono_type;
   2549
   2550	if (old > TCP_CHRONO_UNSPEC)
   2551		tp->chrono_stat[old - 1] += now - tp->chrono_start;
   2552	tp->chrono_start = now;
   2553	tp->chrono_type = new;
   2554}
   2555
   2556void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
   2557{
   2558	struct tcp_sock *tp = tcp_sk(sk);
   2559
   2560	/* If there are multiple conditions worthy of tracking in a
   2561	 * chronograph then the highest priority enum takes precedence
   2562	 * over the other conditions. That way, if something "more interesting"
   2563	 * starts happening, we stop the previous chrono and start a new one.
   2564	 */
   2565	if (type > tp->chrono_type)
   2566		tcp_chrono_set(tp, type);
   2567}
   2568
   2569void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
   2570{
   2571	struct tcp_sock *tp = tcp_sk(sk);
   2572
   2573
   2574	/* There are multiple conditions worthy of tracking in a
   2575	 * chronograph, so that the highest priority enum takes
   2576	 * precedence over the other conditions (see tcp_chrono_start).
   2577	 * If a condition stops, we only stop chrono tracking if
   2578	 * it's the "most interesting" or current chrono we are
   2579	 * tracking, and we start the busy chrono if we have pending data.
   2580	 */
   2581	if (tcp_rtx_and_write_queues_empty(sk))
   2582		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
   2583	else if (type == tp->chrono_type)
   2584		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
   2585}
   2586
   2587/* This routine writes packets to the network.  It advances the
   2588 * send_head.  This happens as incoming acks open up the remote
   2589 * window for us.
   2590 *
   2591 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
   2592 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
   2593 * account rare use of URG, this is not a big flaw.
   2594 *
   2595 * Send at most one packet when push_one > 0. Temporarily ignore
   2596 * cwnd limit to force at most one packet out when push_one == 2.
   2597 *
   2598 * Returns true, if no segments are in flight and we have queued segments,
   2599 * but cannot send anything now because of SWS or another problem.
   2600 */
   2601static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
   2602			   int push_one, gfp_t gfp)
   2603{
   2604	struct tcp_sock *tp = tcp_sk(sk);
   2605	struct sk_buff *skb;
   2606	unsigned int tso_segs, sent_pkts;
   2607	int cwnd_quota;
   2608	int result;
   2609	bool is_cwnd_limited = false, is_rwnd_limited = false;
   2610	u32 max_segs;
   2611
   2612	sent_pkts = 0;
   2613
   2614	tcp_mstamp_refresh(tp);
   2615	if (!push_one) {
   2616		/* Do MTU probing. */
   2617		result = tcp_mtu_probe(sk);
   2618		if (!result) {
   2619			return false;
   2620		} else if (result > 0) {
   2621			sent_pkts = 1;
   2622		}
   2623	}
   2624
   2625	max_segs = tcp_tso_segs(sk, mss_now);
   2626	while ((skb = tcp_send_head(sk))) {
   2627		unsigned int limit;
   2628
   2629		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
   2630			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
   2631			tp->tcp_wstamp_ns = tp->tcp_clock_cache;
   2632			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
   2633			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
   2634			tcp_init_tso_segs(skb, mss_now);
   2635			goto repair; /* Skip network transmission */
   2636		}
   2637
   2638		if (tcp_pacing_check(sk))
   2639			break;
   2640
   2641		tso_segs = tcp_init_tso_segs(skb, mss_now);
   2642		BUG_ON(!tso_segs);
   2643
   2644		cwnd_quota = tcp_cwnd_test(tp, skb);
   2645		if (!cwnd_quota) {
   2646			if (push_one == 2)
   2647				/* Force out a loss probe pkt. */
   2648				cwnd_quota = 1;
   2649			else
   2650				break;
   2651		}
   2652
   2653		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
   2654			is_rwnd_limited = true;
   2655			break;
   2656		}
   2657
   2658		if (tso_segs == 1) {
   2659			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
   2660						     (tcp_skb_is_last(sk, skb) ?
   2661						      nonagle : TCP_NAGLE_PUSH))))
   2662				break;
   2663		} else {
   2664			if (!push_one &&
   2665			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
   2666						 &is_rwnd_limited, max_segs))
   2667				break;
   2668		}
   2669
   2670		limit = mss_now;
   2671		if (tso_segs > 1 && !tcp_urg_mode(tp))
   2672			limit = tcp_mss_split_point(sk, skb, mss_now,
   2673						    min_t(unsigned int,
   2674							  cwnd_quota,
   2675							  max_segs),
   2676						    nonagle);
   2677
   2678		if (skb->len > limit &&
   2679		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
   2680			break;
   2681
   2682		if (tcp_small_queue_check(sk, skb, 0))
   2683			break;
   2684
   2685		/* Argh, we hit an empty skb(), presumably a thread
   2686		 * is sleeping in sendmsg()/sk_stream_wait_memory().
   2687		 * We do not want to send a pure-ack packet and have
   2688		 * a strange looking rtx queue with empty packet(s).
   2689		 */
   2690		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
   2691			break;
   2692
   2693		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
   2694			break;
   2695
   2696repair:
   2697		/* Advance the send_head.  This one is sent out.
   2698		 * This call will increment packets_out.
   2699		 */
   2700		tcp_event_new_data_sent(sk, skb);
   2701
   2702		tcp_minshall_update(tp, mss_now, skb);
   2703		sent_pkts += tcp_skb_pcount(skb);
   2704
   2705		if (push_one)
   2706			break;
   2707	}
   2708
   2709	if (is_rwnd_limited)
   2710		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
   2711	else
   2712		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
   2713
   2714	is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
   2715	if (likely(sent_pkts || is_cwnd_limited))
   2716		tcp_cwnd_validate(sk, is_cwnd_limited);
   2717
   2718	if (likely(sent_pkts)) {
   2719		if (tcp_in_cwnd_reduction(sk))
   2720			tp->prr_out += sent_pkts;
   2721
   2722		/* Send one loss probe per tail loss episode. */
   2723		if (push_one != 2)
   2724			tcp_schedule_loss_probe(sk, false);
   2725		return false;
   2726	}
   2727	return !tp->packets_out && !tcp_write_queue_empty(sk);
   2728}
   2729
   2730bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
   2731{
   2732	struct inet_connection_sock *icsk = inet_csk(sk);
   2733	struct tcp_sock *tp = tcp_sk(sk);
   2734	u32 timeout, rto_delta_us;
   2735	int early_retrans;
   2736
   2737	/* Don't do any loss probe on a Fast Open connection before 3WHS
   2738	 * finishes.
   2739	 */
   2740	if (rcu_access_pointer(tp->fastopen_rsk))
   2741		return false;
   2742
   2743	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
   2744	/* Schedule a loss probe in 2*RTT for SACK capable connections
   2745	 * not in loss recovery, that are either limited by cwnd or application.
   2746	 */
   2747	if ((early_retrans != 3 && early_retrans != 4) ||
   2748	    !tp->packets_out || !tcp_is_sack(tp) ||
   2749	    (icsk->icsk_ca_state != TCP_CA_Open &&
   2750	     icsk->icsk_ca_state != TCP_CA_CWR))
   2751		return false;
   2752
   2753	/* Probe timeout is 2*rtt. Add minimum RTO to account
   2754	 * for delayed ack when there's one outstanding packet. If no RTT
   2755	 * sample is available then probe after TCP_TIMEOUT_INIT.
   2756	 */
   2757	if (tp->srtt_us) {
   2758		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
   2759		if (tp->packets_out == 1)
   2760			timeout += TCP_RTO_MIN;
   2761		else
   2762			timeout += TCP_TIMEOUT_MIN;
   2763	} else {
   2764		timeout = TCP_TIMEOUT_INIT;
   2765	}
   2766
   2767	/* If the RTO formula yields an earlier time, then use that time. */
   2768	rto_delta_us = advancing_rto ?
   2769			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
   2770			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
   2771	if (rto_delta_us > 0)
   2772		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
   2773
   2774	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
   2775	return true;
   2776}
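
       /* Worked example of the probe timeout (illustrative): srtt = 50 ms
        * means tp->srtt_us = 400000 (stored << 3), so srtt_us >> 2 =
        * 100000 us = 2 * srtt.  With several packets in flight only
        * TCP_TIMEOUT_MIN (2 jiffies) is added; with exactly one packet
        * outstanding, TCP_RTO_MIN (~200 ms) is added to leave room for a
        * delayed ACK, giving roughly 300 ms.  The probe is then pulled in
        * if the pending RTO would fire earlier.
        */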
   2777
   2778/* Thanks to skb fast clones, we can detect if a prior transmit of
   2779 * a packet is still in a qdisc or driver queue.
   2780 * In this case, there is very little point doing a retransmit !
   2781 */
   2782static bool skb_still_in_host_queue(struct sock *sk,
   2783				    const struct sk_buff *skb)
   2784{
   2785	if (unlikely(skb_fclone_busy(sk, skb))) {
   2786		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
   2787		smp_mb__after_atomic();
   2788		if (skb_fclone_busy(sk, skb)) {
   2789			NET_INC_STATS(sock_net(sk),
   2790				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
   2791			return true;
   2792		}
   2793	}
   2794	return false;
   2795}
   2796
   2797/* When probe timeout (PTO) fires, try to send a new segment if possible, else
   2798 * retransmit the last segment.
   2799 */
   2800void tcp_send_loss_probe(struct sock *sk)
   2801{
   2802	struct tcp_sock *tp = tcp_sk(sk);
   2803	struct sk_buff *skb;
   2804	int pcount;
   2805	int mss = tcp_current_mss(sk);
   2806
   2807	/* At most one outstanding TLP */
   2808	if (tp->tlp_high_seq)
   2809		goto rearm_timer;
   2810
   2811	tp->tlp_retrans = 0;
   2812	skb = tcp_send_head(sk);
   2813	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
   2814		pcount = tp->packets_out;
   2815		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
   2816		if (tp->packets_out > pcount)
   2817			goto probe_sent;
   2818		goto rearm_timer;
   2819	}
   2820	skb = skb_rb_last(&sk->tcp_rtx_queue);
   2821	if (unlikely(!skb)) {
   2822		WARN_ONCE(tp->packets_out,
   2823			  "invalid inflight: %u state %u cwnd %u mss %d\n",
   2824			  tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
   2825		inet_csk(sk)->icsk_pending = 0;
   2826		return;
   2827	}
   2828
   2829	if (skb_still_in_host_queue(sk, skb))
   2830		goto rearm_timer;
   2831
   2832	pcount = tcp_skb_pcount(skb);
   2833	if (WARN_ON(!pcount))
   2834		goto rearm_timer;
   2835
   2836	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
   2837		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
   2838					  (pcount - 1) * mss, mss,
   2839					  GFP_ATOMIC)))
   2840			goto rearm_timer;
   2841		skb = skb_rb_next(skb);
   2842	}
   2843
   2844	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
   2845		goto rearm_timer;
   2846
   2847	if (__tcp_retransmit_skb(sk, skb, 1))
   2848		goto rearm_timer;
   2849
   2850	tp->tlp_retrans = 1;
   2851
   2852probe_sent:
   2853	/* Record snd_nxt for loss detection. */
   2854	tp->tlp_high_seq = tp->snd_nxt;
   2855
   2856	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
   2857	/* Reset s.t. tcp_rearm_rto will restart timer from now */
   2858	inet_csk(sk)->icsk_pending = 0;
   2859rearm_timer:
   2860	tcp_rearm_rto(sk);
   2861}
   2862
   2863/* Push out any pending frames which were held back due to
   2864 * TCP_CORK or attempt at coalescing tiny packets.
   2865 * The socket must be locked by the caller.
   2866 */
   2867void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
   2868			       int nonagle)
   2869{
   2870	/* If we are closed, the bytes will have to remain here.
   2871	 * In time closedown will finish, we empty the write queue and
   2872	 * all will be happy.
   2873	 */
   2874	if (unlikely(sk->sk_state == TCP_CLOSE))
   2875		return;
   2876
   2877	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
   2878			   sk_gfp_mask(sk, GFP_ATOMIC)))
   2879		tcp_check_probe_timer(sk);
   2880}
   2881
   2882/* Send _single_ skb sitting at the send head. This function requires
   2883 * true push pending frames to set up probe timer etc.
   2884 */
   2885void tcp_push_one(struct sock *sk, unsigned int mss_now)
   2886{
   2887	struct sk_buff *skb = tcp_send_head(sk);
   2888
   2889	BUG_ON(!skb || skb->len < mss_now);
   2890
   2891	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
   2892}
   2893
   2894/* This function returns the amount that we can raise the
   2895 * usable window based on the following constraints
   2896 *
   2897 * 1. The window can never be shrunk once it is offered (RFC 793)
   2898 * 2. We limit memory per socket
   2899 *
   2900 * RFC 1122:
   2901 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
   2902 *  RECV.NEXT + RCV.WIN fixed until:
   2903 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
   2904 *
   2905 * i.e. don't raise the right edge of the window until you can raise
   2906 * it at least MSS bytes.
   2907 *
   2908 * Unfortunately, the recommended algorithm breaks header prediction,
   2909 * since header prediction assumes th->window stays fixed.
   2910 *
   2911 * Strictly speaking, keeping th->window fixed violates the receiver
   2912 * side SWS prevention criteria. The problem is that under this rule
   2913 * a stream of single byte packets will cause the right side of the
   2914 * window to always advance by a single byte.
   2915 *
   2916 * Of course, if the sender implements sender side SWS prevention
   2917 * then this will not be a problem.
   2918 *
   2919 * BSD seems to make the following compromise:
   2920 *
   2921 *	If the free space is less than the 1/4 of the maximum
   2922 *	space available and the free space is less than 1/2 mss,
   2923 *	then set the window to 0.
   2924 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
   2925 *	Otherwise, just prevent the window from shrinking
   2926 *	and from being larger than the largest representable value.
   2927 *
   2928 * This prevents incremental opening of the window in the regime
   2929 * where TCP is limited by the speed of the reader side taking
   2930 * data out of the TCP receive queue. It does nothing about
   2931 * those cases where the window is constrained on the sender side
   2932 * because the pipeline is full.
   2933 *
   2934 * BSD also seems to "accidentally" limit itself to windows that are a
   2935 * multiple of MSS, at least until the free space gets quite small.
   2936 * This would appear to be a side effect of the mbuf implementation.
   2937 * Combining these two algorithms results in the observed behavior
   2938 * of having a fixed window size at almost all times.
   2939 *
   2940 * Below we obtain similar behavior by forcing the offered window to
   2941 * a multiple of the mss when it is feasible to do so.
   2942 *
   2943 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
   2944 * Regular options like TIMESTAMP are taken into account.
   2945 */
   2946u32 __tcp_select_window(struct sock *sk)
   2947{
   2948	struct inet_connection_sock *icsk = inet_csk(sk);
   2949	struct tcp_sock *tp = tcp_sk(sk);
   2950	/* MSS for the peer's data.  Previous versions used mss_clamp
   2951	 * here.  I don't know if the value based on our guesses
   2952	 * of peer's MSS is better for the performance.  It's more correct
   2953	 * but may be worse for the performance because of rcv_mss
   2954	 * fluctuations.  --SAW  1998/11/1
   2955	 */
   2956	int mss = icsk->icsk_ack.rcv_mss;
   2957	int free_space = tcp_space(sk);
   2958	int allowed_space = tcp_full_space(sk);
   2959	int full_space, window;
   2960
   2961	if (sk_is_mptcp(sk))
   2962		mptcp_space(sk, &free_space, &allowed_space);
   2963
   2964	full_space = min_t(int, tp->window_clamp, allowed_space);
   2965
   2966	if (unlikely(mss > full_space)) {
   2967		mss = full_space;
   2968		if (mss <= 0)
   2969			return 0;
   2970	}
   2971	if (free_space < (full_space >> 1)) {
   2972		icsk->icsk_ack.quick = 0;
   2973
   2974		if (tcp_under_memory_pressure(sk))
   2975			tcp_adjust_rcv_ssthresh(sk);
   2976
   2977		/* free_space might become our new window, make sure we don't
   2978		 * increase it due to wscale.
   2979		 */
   2980		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
   2981
   2982		/* if free space is less than mss estimate, or is below 1/16th
   2983		 * of the maximum allowed, try to move to zero-window, else
   2984		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
   2985		 * new incoming data is dropped due to memory limits.
   2986		 * With large window, mss test triggers way too late in order
   2987		 * to announce zero window in time before rmem limit kicks in.
   2988		 */
   2989		if (free_space < (allowed_space >> 4) || free_space < mss)
   2990			return 0;
   2991	}
   2992
   2993	if (free_space > tp->rcv_ssthresh)
   2994		free_space = tp->rcv_ssthresh;
   2995
   2996	/* Don't do rounding if we are using window scaling, since the
   2997	 * scaled window will not line up with the MSS boundary anyway.
   2998	 */
   2999	if (tp->rx_opt.rcv_wscale) {
   3000		window = free_space;
   3001
   3002		/* Advertise enough space so that it won't get scaled away.
   3003		 * Important case: prevent zero window announcement if
   3004		 * 1<<rcv_wscale > mss.
   3005		 */
   3006		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
   3007	} else {
   3008		window = tp->rcv_wnd;
   3009		/* Get the largest window that is a nice multiple of mss.
   3010		 * Window clamp already applied above.
   3011		 * If our current window offering is within 1 mss of the
   3012		 * free space we just keep it. This prevents the divide
   3013		 * and multiply from happening most of the time.
   3014		 * We also don't do any window rounding when the free space
   3015		 * is too small.
   3016		 */
   3017		if (window <= free_space - mss || window > free_space)
   3018			window = rounddown(free_space, mss);
   3019		else if (mss == full_space &&
   3020			 free_space > window + (full_space >> 1))
   3021			window = free_space;
   3022	}
   3023
   3024	return window;
   3025}
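
       /* Illustrative numbers for the non-scaled branch above: with
        * mss = 1448, free_space = 10000 and a current offer of 8000,
        * 8000 <= 10000 - 1448, so we round down and advertise 8688
        * (6 * 1448).  Had the current offer been 9000, it is within one
        * mss of free_space and is kept as-is, skipping the divide.  With
        * window scaling, ALIGN() rounds *up* to a multiple of
        * 1 << rcv_wscale (e.g. 128 for wscale 7), so a small but non-zero
        * window is not scaled away to an advertised zero.
        */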
   3026
   3027void tcp_skb_collapse_tstamp(struct sk_buff *skb,
   3028			     const struct sk_buff *next_skb)
   3029{
   3030	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
   3031		const struct skb_shared_info *next_shinfo =
   3032			skb_shinfo(next_skb);
   3033		struct skb_shared_info *shinfo = skb_shinfo(skb);
   3034
   3035		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
   3036		shinfo->tskey = next_shinfo->tskey;
   3037		TCP_SKB_CB(skb)->txstamp_ack |=
   3038			TCP_SKB_CB(next_skb)->txstamp_ack;
   3039	}
   3040}
   3041
   3042/* Collapses two adjacent SKB's during retransmission. */
   3043static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
   3044{
   3045	struct tcp_sock *tp = tcp_sk(sk);
   3046	struct sk_buff *next_skb = skb_rb_next(skb);
   3047	int next_skb_size;
   3048
   3049	next_skb_size = next_skb->len;
   3050
   3051	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
   3052
   3053	if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
   3054		return false;
   3055
   3056	tcp_highest_sack_replace(sk, next_skb, skb);
   3057
   3058	/* Update sequence range on original skb. */
   3059	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
   3060
   3061	/* Merge over control information. This moves PSH/FIN etc. over */
   3062	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
   3063
   3064	/* All done, get rid of second SKB and account for it so
   3065	 * packet counting does not break.
   3066	 */
   3067	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
   3068	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
   3069
   3070	/* changed transmit queue under us so clear hints */
   3071	tcp_clear_retrans_hints_partial(tp);
   3072	if (next_skb == tp->retransmit_skb_hint)
   3073		tp->retransmit_skb_hint = skb;
   3074
   3075	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
   3076
   3077	tcp_skb_collapse_tstamp(skb, next_skb);
   3078
   3079	tcp_rtx_queue_unlink_and_free(next_skb, sk);
   3080	return true;
   3081}
   3082
   3083/* Check if coalescing SKBs is legal. */
   3084static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
   3085{
   3086	if (tcp_skb_pcount(skb) > 1)
   3087		return false;
   3088	if (skb_cloned(skb))
   3089		return false;
   3090	/* Some heuristics for collapsing over SACK'd data could be invented */
   3091	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
   3092		return false;
   3093
   3094	return true;
   3095}
   3096
   3097/* Collapse packets in the retransmit queue to create fewer
   3098 * packets on the wire. This is only done on retransmission.
   3099 */
   3100static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
   3101				     int space)
   3102{
   3103	struct tcp_sock *tp = tcp_sk(sk);
   3104	struct sk_buff *skb = to, *tmp;
   3105	bool first = true;
   3106
   3107	if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
   3108		return;
   3109	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
   3110		return;
   3111
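        	/* Walk forward from 'to': the first pass only accounts for the
        	 * length of 'to' itself; each following skb is merged into 'to'
        	 * while it fits within 'space' and the send window.
        	 */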
   3112	skb_rbtree_walk_from_safe(skb, tmp) {
   3113		if (!tcp_can_collapse(sk, skb))
   3114			break;
   3115
   3116		if (!tcp_skb_can_collapse(to, skb))
   3117			break;
   3118
   3119		space -= skb->len;
   3120
   3121		if (first) {
   3122			first = false;
   3123			continue;
   3124		}
   3125
   3126		if (space < 0)
   3127			break;
   3128
   3129		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
   3130			break;
   3131
   3132		if (!tcp_collapse_retrans(sk, to))
   3133			break;
   3134	}
   3135}
   3136
   3137/* This retransmits one SKB.  Policy decisions and retransmit queue
   3138 * state updates are done by the caller.  Returns non-zero if an
   3139 * error occurred which prevented the send.
   3140 */
   3141int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
   3142{
   3143	struct inet_connection_sock *icsk = inet_csk(sk);
   3144	struct tcp_sock *tp = tcp_sk(sk);
   3145	unsigned int cur_mss;
   3146	int diff, len, err;
   3147
   3148
   3149	/* Inconclusive MTU probe */
   3150	if (icsk->icsk_mtup.probe_size)
   3151		icsk->icsk_mtup.probe_size = 0;
   3152
   3153	if (skb_still_in_host_queue(sk, skb))
   3154		return -EBUSY;
   3155
   3156	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
   3157		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
   3158			WARN_ON_ONCE(1);
   3159			return -EINVAL;
   3160		}
   3161		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
   3162			return -ENOMEM;
   3163	}
   3164
   3165	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
   3166		return -EHOSTUNREACH; /* Routing failure or similar. */
   3167
   3168	cur_mss = tcp_current_mss(sk);
   3169
    3170	/* If the receiver has shrunk its window, and skb is out of
    3171	 * the new window, do not retransmit it. The exception is the
    3172	 * case when the window is shrunk to zero. In this case
    3173	 * our retransmit serves as a zero window probe.
    3174	 */
   3175	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
   3176	    TCP_SKB_CB(skb)->seq != tp->snd_una)
   3177		return -EAGAIN;
   3178
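        	/* Retransmit at most 'segs' segments: fragment the skb if it is
        	 * larger than segs * cur_mss; otherwise refresh its TSO segment
        	 * count for the current MSS and, if it is smaller than one MSS,
        	 * try to collapse it with the following skbs.
        	 */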
   3179	len = cur_mss * segs;
   3180	if (skb->len > len) {
   3181		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
   3182				 cur_mss, GFP_ATOMIC))
   3183			return -ENOMEM; /* We'll try again later. */
   3184	} else {
   3185		if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
   3186			return -ENOMEM;
   3187
   3188		diff = tcp_skb_pcount(skb);
   3189		tcp_set_skb_tso_segs(skb, cur_mss);
   3190		diff -= tcp_skb_pcount(skb);
   3191		if (diff)
   3192			tcp_adjust_pcount(sk, skb, diff);
   3193		if (skb->len < cur_mss)
   3194			tcp_retrans_try_collapse(sk, skb, cur_mss);
   3195	}
   3196
   3197	/* RFC3168, section 6.1.1.1. ECN fallback */
   3198	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
   3199		tcp_ecn_clear_syn(sk, skb);
   3200
   3201	/* Update global and local TCP statistics. */
   3202	segs = tcp_skb_pcount(skb);
   3203	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
   3204	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
   3205		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
   3206	tp->total_retrans += segs;
   3207	tp->bytes_retrans += skb->len;
   3208
   3209	/* make sure skb->data is aligned on arches that require it
   3210	 * and check if ack-trimming & collapsing extended the headroom
   3211	 * beyond what csum_start can cover.
   3212	 */
   3213	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
   3214		     skb_headroom(skb) >= 0xFFFF)) {
   3215		struct sk_buff *nskb;
   3216
   3217		tcp_skb_tsorted_save(skb) {
   3218			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
   3219			if (nskb) {
   3220				nskb->dev = NULL;
   3221				err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
   3222			} else {
   3223				err = -ENOBUFS;
   3224			}
   3225		} tcp_skb_tsorted_restore(skb);
   3226
   3227		if (!err) {
   3228			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
   3229			tcp_rate_skb_sent(sk, skb);
   3230		}
   3231	} else {
   3232		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
   3233	}
   3234
   3235	/* To avoid taking spuriously low RTT samples based on a timestamp
   3236	 * for a transmit that never happened, always mark EVER_RETRANS
   3237	 */
   3238	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
   3239
   3240	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
   3241		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
   3242				  TCP_SKB_CB(skb)->seq, segs, err);
   3243
   3244	if (likely(!err)) {
   3245		trace_tcp_retransmit_skb(sk, skb);
   3246	} else if (err != -EBUSY) {
   3247		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
   3248	}
   3249	return err;
   3250}
   3251
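        /* Retransmit one skb and update the retransmission bookkeeping:
         * retrans_out, the TCPCB_RETRANS flag, retrans_stamp and undo_retrans.
         */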
   3252int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
   3253{
   3254	struct tcp_sock *tp = tcp_sk(sk);
   3255	int err = __tcp_retransmit_skb(sk, skb, segs);
   3256
   3257	if (err == 0) {
   3258#if FASTRETRANS_DEBUG > 0
   3259		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
   3260			net_dbg_ratelimited("retrans_out leaked\n");
   3261		}
   3262#endif
   3263		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
   3264		tp->retrans_out += tcp_skb_pcount(skb);
   3265	}
   3266
   3267	/* Save stamp of the first (attempted) retransmit. */
   3268	if (!tp->retrans_stamp)
   3269		tp->retrans_stamp = tcp_skb_timestamp(skb);
   3270
   3271	if (tp->undo_retrans < 0)
   3272		tp->undo_retrans = 0;
   3273	tp->undo_retrans += tcp_skb_pcount(skb);
   3274	return err;
   3275}
   3276
   3277/* This gets called after a retransmit timeout, and the initially
   3278 * retransmitted data is acknowledged.  It tries to continue
   3279 * resending the rest of the retransmit queue, until either
   3280 * we've sent it all or the congestion window limit is reached.
   3281 */
   3282void tcp_xmit_retransmit_queue(struct sock *sk)
   3283{
   3284	const struct inet_connection_sock *icsk = inet_csk(sk);
   3285	struct sk_buff *skb, *rtx_head, *hole = NULL;
   3286	struct tcp_sock *tp = tcp_sk(sk);
   3287	bool rearm_timer = false;
   3288	u32 max_segs;
   3289	int mib_idx;
   3290
   3291	if (!tp->packets_out)
   3292		return;
   3293
   3294	rtx_head = tcp_rtx_queue_head(sk);
   3295	skb = tp->retransmit_skb_hint ?: rtx_head;
   3296	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
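        	/* Walk the rtx queue from the hint. Skbs marked lost are
        	 * retransmitted; un-SACKed, un-retransmitted skbs not (yet) marked
        	 * lost are remembered in 'hole' so the hint does not skip past
        	 * them. The walk stops when the cwnd or lost_out budget is spent.
        	 */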
   3297	skb_rbtree_walk_from(skb) {
   3298		__u8 sacked;
   3299		int segs;
   3300
   3301		if (tcp_pacing_check(sk))
   3302			break;
   3303
   3304		/* we could do better than to assign each time */
   3305		if (!hole)
   3306			tp->retransmit_skb_hint = skb;
   3307
   3308		segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
   3309		if (segs <= 0)
   3310			break;
   3311		sacked = TCP_SKB_CB(skb)->sacked;
    3312		/* In case tcp_shift_skb_data() has aggregated large skbs,
    3313		 * we need to make sure we are not sending too-big TSO packets
    3314		 */
   3315		segs = min_t(int, segs, max_segs);
   3316
   3317		if (tp->retrans_out >= tp->lost_out) {
   3318			break;
   3319		} else if (!(sacked & TCPCB_LOST)) {
   3320			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
   3321				hole = skb;
   3322			continue;
   3323
   3324		} else {
   3325			if (icsk->icsk_ca_state != TCP_CA_Loss)
   3326				mib_idx = LINUX_MIB_TCPFASTRETRANS;
   3327			else
   3328				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
   3329		}
   3330
   3331		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
   3332			continue;
   3333
   3334		if (tcp_small_queue_check(sk, skb, 1))
   3335			break;
   3336
   3337		if (tcp_retransmit_skb(sk, skb, segs))
   3338			break;
   3339
   3340		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
   3341
   3342		if (tcp_in_cwnd_reduction(sk))
   3343			tp->prr_out += tcp_skb_pcount(skb);
   3344
   3345		if (skb == rtx_head &&
   3346		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
   3347			rearm_timer = true;
   3348
   3349	}
   3350	if (rearm_timer)
   3351		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
   3352				     inet_csk(sk)->icsk_rto,
   3353				     TCP_RTO_MAX);
   3354}
   3355
    3356/* We allow FIN packets to exceed memory limits to expedite
    3357 * connection tear down and (memory) recovery.
    3358 * Otherwise tcp_send_fin() could be tempted to either delay the FIN
    3359 * or even be forced to close the flow without any FIN.
    3360 * In general, we want to allow one skb per socket to avoid hangs
    3361 * with edge-triggered epoll()
    3362 */
   3363void sk_forced_mem_schedule(struct sock *sk, int size)
   3364{
   3365	int amt;
   3366
   3367	if (size <= sk->sk_forward_alloc)
   3368		return;
   3369	amt = sk_mem_pages(size);
   3370	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
   3371	sk_memory_allocated_add(sk, amt);
   3372
   3373	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
   3374		mem_cgroup_charge_skmem(sk->sk_memcg, amt,
   3375					gfp_memcg_charge() | __GFP_NOFAIL);
   3376}
   3377
   3378/* Send a FIN. The caller locks the socket for us.
   3379 * We should try to send a FIN packet really hard, but eventually give up.
   3380 */
   3381void tcp_send_fin(struct sock *sk)
   3382{
   3383	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
   3384	struct tcp_sock *tp = tcp_sk(sk);
   3385
    3386	/* Optimization: tack on the FIN if we have one skb in the write queue
    3387	 * and this skb was not yet sent, or if we are under memory pressure.
    3388	 * Note: in the latter case, the FIN packet will be sent after a timeout,
    3389	 * as the TCP stack thinks it has already been transmitted.
    3390	 */
   3391	tskb = tail;
   3392	if (!tskb && tcp_under_memory_pressure(sk))
   3393		tskb = skb_rb_last(&sk->tcp_rtx_queue);
   3394
   3395	if (tskb) {
   3396		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
   3397		TCP_SKB_CB(tskb)->end_seq++;
   3398		tp->write_seq++;
   3399		if (!tail) {
   3400			/* This means tskb was already sent.
   3401			 * Pretend we included the FIN on previous transmit.
   3402			 * We need to set tp->snd_nxt to the value it would have
   3403			 * if FIN had been sent. This is because retransmit path
   3404			 * does not change tp->snd_nxt.
   3405			 */
   3406			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
   3407			return;
   3408		}
   3409	} else {
   3410		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
   3411		if (unlikely(!skb))
   3412			return;
   3413
   3414		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
   3415		skb_reserve(skb, MAX_TCP_HEADER);
   3416		sk_forced_mem_schedule(sk, skb->truesize);
   3417		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
   3418		tcp_init_nondata_skb(skb, tp->write_seq,
   3419				     TCPHDR_ACK | TCPHDR_FIN);
   3420		tcp_queue_skb(sk, skb);
   3421	}
   3422	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
   3423}
   3424
   3425/* We get here when a process closes a file descriptor (either due to
   3426 * an explicit close() or as a byproduct of exit()'ing) and there
   3427 * was unread data in the receive queue.  This behavior is recommended
   3428 * by RFC 2525, section 2.17.  -DaveM
   3429 */
   3430void tcp_send_active_reset(struct sock *sk, gfp_t priority)
   3431{
   3432	struct sk_buff *skb;
   3433
   3434	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
   3435
   3436	/* NOTE: No TCP options attached and we never retransmit this. */
   3437	skb = alloc_skb(MAX_TCP_HEADER, priority);
   3438	if (!skb) {
   3439		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
   3440		return;
   3441	}
   3442
   3443	/* Reserve space for headers and prepare control bits. */
   3444	skb_reserve(skb, MAX_TCP_HEADER);
   3445	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
   3446			     TCPHDR_ACK | TCPHDR_RST);
   3447	tcp_mstamp_refresh(tcp_sk(sk));
   3448	/* Send it off. */
   3449	if (tcp_transmit_skb(sk, skb, 0, priority))
   3450		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
   3451
    3452	/* The skb argument of trace_tcp_send_reset() is the skb that caused
    3453	 * the RST; the skb here is a different one, so pass NULL.
    3454	 */
   3455	trace_tcp_send_reset(sk, NULL);
   3456}
   3457
   3458/* Send a crossed SYN-ACK during socket establishment.
   3459 * WARNING: This routine must only be called when we have already sent
   3460 * a SYN packet that crossed the incoming SYN that caused this routine
   3461 * to get called. If this assumption fails then the initial rcv_wnd
   3462 * and rcv_wscale values will not be correct.
   3463 */
   3464int tcp_send_synack(struct sock *sk)
   3465{
   3466	struct sk_buff *skb;
   3467
   3468	skb = tcp_rtx_queue_head(sk);
   3469	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
   3470		pr_err("%s: wrong queue state\n", __func__);
   3471		return -EFAULT;
   3472	}
   3473	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
   3474		if (skb_cloned(skb)) {
   3475			struct sk_buff *nskb;
   3476
   3477			tcp_skb_tsorted_save(skb) {
   3478				nskb = skb_copy(skb, GFP_ATOMIC);
   3479			} tcp_skb_tsorted_restore(skb);
   3480			if (!nskb)
   3481				return -ENOMEM;
   3482			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
   3483			tcp_highest_sack_replace(sk, skb, nskb);
   3484			tcp_rtx_queue_unlink_and_free(skb, sk);
   3485			__skb_header_release(nskb);
   3486			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
   3487			sk_wmem_queued_add(sk, nskb->truesize);
   3488			sk_mem_charge(sk, nskb->truesize);
   3489			skb = nskb;
   3490		}
   3491
   3492		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
   3493		tcp_ecn_send_synack(sk, skb);
   3494	}
   3495	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
   3496}
   3497
   3498/**
   3499 * tcp_make_synack - Allocate one skb and build a SYNACK packet.
   3500 * @sk: listener socket
   3501 * @dst: dst entry attached to the SYNACK. It is consumed and caller
   3502 *       should not use it again.
   3503 * @req: request_sock pointer
   3504 * @foc: cookie for tcp fast open
   3505 * @synack_type: Type of synack to prepare
   3506 * @syn_skb: SYN packet just received.  It could be NULL for rtx case.
   3507 */
   3508struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
   3509				struct request_sock *req,
   3510				struct tcp_fastopen_cookie *foc,
   3511				enum tcp_synack_type synack_type,
   3512				struct sk_buff *syn_skb)
   3513{
   3514	struct inet_request_sock *ireq = inet_rsk(req);
   3515	const struct tcp_sock *tp = tcp_sk(sk);
   3516	struct tcp_md5sig_key *md5 = NULL;
   3517	struct tcp_out_options opts;
   3518	struct sk_buff *skb;
   3519	int tcp_header_size;
   3520	struct tcphdr *th;
   3521	int mss;
   3522	u64 now;
   3523
   3524	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
   3525	if (unlikely(!skb)) {
   3526		dst_release(dst);
   3527		return NULL;
   3528	}
   3529	/* Reserve space for headers. */
   3530	skb_reserve(skb, MAX_TCP_HEADER);
   3531
   3532	switch (synack_type) {
   3533	case TCP_SYNACK_NORMAL:
   3534		skb_set_owner_w(skb, req_to_sk(req));
   3535		break;
   3536	case TCP_SYNACK_COOKIE:
   3537		/* Under synflood, we do not attach skb to a socket,
   3538		 * to avoid false sharing.
   3539		 */
   3540		break;
   3541	case TCP_SYNACK_FASTOPEN:
    3542		/* sk is a const pointer, because we want to express that multiple
    3543		 * cpus might call us concurrently.
    3544		 * sk->sk_wmem_alloc is an atomic, so we can promote sk to rw.
    3545		 */
   3546		skb_set_owner_w(skb, (struct sock *)sk);
   3547		break;
   3548	}
   3549	skb_dst_set(skb, dst);
   3550
   3551	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
   3552
   3553	memset(&opts, 0, sizeof(opts));
   3554	now = tcp_clock_ns();
   3555#ifdef CONFIG_SYN_COOKIES
   3556	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
   3557		skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
   3558				      true);
   3559	else
   3560#endif
   3561	{
   3562		skb_set_delivery_time(skb, now, true);
   3563		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
   3564			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
   3565	}
   3566
   3567#ifdef CONFIG_TCP_MD5SIG
   3568	rcu_read_lock();
   3569	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
   3570#endif
   3571	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
   3572	/* bpf program will be interested in the tcp_flags */
   3573	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
   3574	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
   3575					     foc, synack_type,
   3576					     syn_skb) + sizeof(*th);
   3577
   3578	skb_push(skb, tcp_header_size);
   3579	skb_reset_transport_header(skb);
   3580
   3581	th = (struct tcphdr *)skb->data;
   3582	memset(th, 0, sizeof(struct tcphdr));
   3583	th->syn = 1;
   3584	th->ack = 1;
   3585	tcp_ecn_make_synack(req, th);
   3586	th->source = htons(ireq->ir_num);
   3587	th->dest = ireq->ir_rmt_port;
   3588	skb->mark = ireq->ir_mark;
   3589	skb->ip_summed = CHECKSUM_PARTIAL;
   3590	th->seq = htonl(tcp_rsk(req)->snt_isn);
   3591	/* XXX data is queued and acked as is. No buffer/window check */
   3592	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
   3593
   3594	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
   3595	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
   3596	tcp_options_write(th, NULL, &opts);
   3597	th->doff = (tcp_header_size >> 2);
   3598	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
   3599
   3600#ifdef CONFIG_TCP_MD5SIG
   3601	/* Okay, we have all we need - do the md5 hash if needed */
   3602	if (md5)
   3603		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
   3604					       md5, req_to_sk(req), skb);
   3605	rcu_read_unlock();
   3606#endif
   3607
   3608	bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
   3609				synack_type, &opts);
   3610
   3611	skb_set_delivery_time(skb, now, true);
   3612	tcp_add_tx_delay(skb, tp);
   3613
   3614	return skb;
   3615}
   3616EXPORT_SYMBOL(tcp_make_synack);
   3617
   3618static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
   3619{
   3620	struct inet_connection_sock *icsk = inet_csk(sk);
   3621	const struct tcp_congestion_ops *ca;
   3622	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
   3623
   3624	if (ca_key == TCP_CA_UNSPEC)
   3625		return;
   3626
   3627	rcu_read_lock();
   3628	ca = tcp_ca_find_key(ca_key);
   3629	if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
   3630		bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
   3631		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
   3632		icsk->icsk_ca_ops = ca;
   3633	}
   3634	rcu_read_unlock();
   3635}
   3636
   3637/* Do all connect socket setups that can be done AF independent. */
   3638static void tcp_connect_init(struct sock *sk)
   3639{
   3640	const struct dst_entry *dst = __sk_dst_get(sk);
   3641	struct tcp_sock *tp = tcp_sk(sk);
   3642	__u8 rcv_wscale;
   3643	u32 rcv_wnd;
   3644
   3645	/* We'll fix this up when we get a response from the other end.
   3646	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
   3647	 */
   3648	tp->tcp_header_len = sizeof(struct tcphdr);
   3649	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
   3650		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
   3651
   3652#ifdef CONFIG_TCP_MD5SIG
   3653	if (tp->af_specific->md5_lookup(sk, sk))
   3654		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
   3655#endif
   3656
    3657	/* If the user set TCP_MAXSEG, record it as the MSS clamp */
   3658	if (tp->rx_opt.user_mss)
   3659		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
   3660	tp->max_window = 0;
   3661	tcp_mtup_init(sk);
   3662	tcp_sync_mss(sk, dst_mtu(dst));
   3663
   3664	tcp_ca_dst_init(sk, dst);
   3665
   3666	if (!tp->window_clamp)
   3667		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
   3668	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
   3669
   3670	tcp_initialize_rcv_mss(sk);
   3671
    3672	/* limit the window selection if the user enforces a smaller rx buffer */
   3673	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
   3674	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
   3675		tp->window_clamp = tcp_full_space(sk);
   3676
   3677	rcv_wnd = tcp_rwnd_init_bpf(sk);
   3678	if (rcv_wnd == 0)
   3679		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
   3680
   3681	tcp_select_initial_window(sk, tcp_full_space(sk),
   3682				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
   3683				  &tp->rcv_wnd,
   3684				  &tp->window_clamp,
   3685				  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
   3686				  &rcv_wscale,
   3687				  rcv_wnd);
   3688
   3689	tp->rx_opt.rcv_wscale = rcv_wscale;
   3690	tp->rcv_ssthresh = tp->rcv_wnd;
   3691
   3692	sk->sk_err = 0;
   3693	sock_reset_flag(sk, SOCK_DONE);
   3694	tp->snd_wnd = 0;
   3695	tcp_init_wl(tp, 0);
   3696	tcp_write_queue_purge(sk);
   3697	tp->snd_una = tp->write_seq;
   3698	tp->snd_sml = tp->write_seq;
   3699	tp->snd_up = tp->write_seq;
   3700	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
   3701
   3702	if (likely(!tp->repair))
   3703		tp->rcv_nxt = 0;
   3704	else
   3705		tp->rcv_tstamp = tcp_jiffies32;
   3706	tp->rcv_wup = tp->rcv_nxt;
   3707	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
   3708
   3709	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
   3710	inet_csk(sk)->icsk_retransmits = 0;
   3711	tcp_clear_retrans(tp);
   3712}
   3713
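        /* Queue accounting for a SYN (or SYN+DATA) skb on the connect path:
         * charge the skb to the socket and advance write_seq and packets_out.
         */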
   3714static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
   3715{
   3716	struct tcp_sock *tp = tcp_sk(sk);
   3717	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
   3718
   3719	tcb->end_seq += skb->len;
   3720	__skb_header_release(skb);
   3721	sk_wmem_queued_add(sk, skb->truesize);
   3722	sk_mem_charge(sk, skb->truesize);
   3723	WRITE_ONCE(tp->write_seq, tcb->end_seq);
   3724	tp->packets_out += tcp_skb_pcount(skb);
   3725}
   3726
    3727/* Build and send a SYN with data and a (cached) Fast Open cookie. However,
    3728 * queue a data-only packet after the regular SYN, such that regular SYNs
    3729 * are retransmitted on timeouts. Also, if the remote SYN-ACK acknowledges
    3730 * only the SYN sequence, the data are retransmitted in the first ACK.
    3731 * If the cookie is not cached or another error occurs, fall back to sending
    3732 * a regular SYN with the Fast Open cookie request option.
    3733 */
   3734static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
   3735{
   3736	struct inet_connection_sock *icsk = inet_csk(sk);
   3737	struct tcp_sock *tp = tcp_sk(sk);
   3738	struct tcp_fastopen_request *fo = tp->fastopen_req;
   3739	int space, err = 0;
   3740	struct sk_buff *syn_data;
   3741
   3742	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
   3743	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
   3744		goto fallback;
   3745
   3746	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
   3747	 * user-MSS. Reserve maximum option space for middleboxes that add
   3748	 * private TCP options. The cost is reduced data space in SYN :(
   3749	 */
   3750	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
   3751	/* Sync mss_cache after updating the mss_clamp */
   3752	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
   3753
   3754	space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
   3755		MAX_TCP_OPTION_SPACE;
   3756
   3757	space = min_t(size_t, space, fo->size);
   3758
   3759	/* limit to order-0 allocations */
   3760	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
   3761
   3762	syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
   3763	if (!syn_data)
   3764		goto fallback;
   3765	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
   3766	if (space) {
   3767		int copied = copy_from_iter(skb_put(syn_data, space), space,
   3768					    &fo->data->msg_iter);
   3769		if (unlikely(!copied)) {
   3770			tcp_skb_tsorted_anchor_cleanup(syn_data);
   3771			kfree_skb(syn_data);
   3772			goto fallback;
   3773		}
   3774		if (copied != space) {
   3775			skb_trim(syn_data, copied);
   3776			space = copied;
   3777		}
   3778		skb_zcopy_set(syn_data, fo->uarg, NULL);
   3779	}
   3780	/* No more data pending in inet_wait_for_connect() */
   3781	if (space == fo->size)
   3782		fo->data = NULL;
   3783	fo->copied = space;
   3784
   3785	tcp_connect_queue_skb(sk, syn_data);
   3786	if (syn_data->len)
   3787		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
   3788
   3789	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
   3790
   3791	skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
   3792
    3793	/* Now that the full SYN+DATA has been cloned and sent (or not),
    3794	 * remove the SYN from the original skb (syn_data) that we keep
    3795	 * queued in case of a retransmit, as we also have the SYN packet
    3796	 * (with no data) in the same queue.
    3797	 */
   3798	TCP_SKB_CB(syn_data)->seq++;
   3799	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
   3800	if (!err) {
   3801		tp->syn_data = (fo->copied > 0);
   3802		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
   3803		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
   3804		goto done;
   3805	}
   3806
   3807	/* data was not sent, put it in write_queue */
   3808	__skb_queue_tail(&sk->sk_write_queue, syn_data);
   3809	tp->packets_out -= tcp_skb_pcount(syn_data);
   3810
   3811fallback:
   3812	/* Send a regular SYN with Fast Open cookie request option */
   3813	if (fo->cookie.len > 0)
   3814		fo->cookie.len = 0;
   3815	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
   3816	if (err)
   3817		tp->syn_fastopen = 0;
   3818done:
   3819	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
   3820	return err;
   3821}
   3822
   3823/* Build a SYN and send it off. */
   3824int tcp_connect(struct sock *sk)
   3825{
   3826	struct tcp_sock *tp = tcp_sk(sk);
   3827	struct sk_buff *buff;
   3828	int err;
   3829
   3830	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
   3831
   3832	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
   3833		return -EHOSTUNREACH; /* Routing failure or similar. */
   3834
   3835	tcp_connect_init(sk);
   3836
   3837	if (unlikely(tp->repair)) {
   3838		tcp_finish_connect(sk, NULL);
   3839		return 0;
   3840	}
   3841
   3842	buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
   3843	if (unlikely(!buff))
   3844		return -ENOBUFS;
   3845
   3846	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
   3847	tcp_mstamp_refresh(tp);
   3848	tp->retrans_stamp = tcp_time_stamp(tp);
   3849	tcp_connect_queue_skb(sk, buff);
   3850	tcp_ecn_send_syn(sk, buff);
   3851	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
   3852
   3853	/* Send off SYN; include data in Fast Open. */
   3854	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
   3855	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
   3856	if (err == -ECONNREFUSED)
   3857		return err;
   3858
   3859	/* We change tp->snd_nxt after the tcp_transmit_skb() call
   3860	 * in order to make this packet get counted in tcpOutSegs.
   3861	 */
   3862	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
   3863	tp->pushed_seq = tp->write_seq;
   3864	buff = tcp_send_head(sk);
   3865	if (unlikely(buff)) {
   3866		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
   3867		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
   3868	}
   3869	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
   3870
   3871	/* Timer for repeating the SYN until an answer. */
   3872	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
   3873				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
   3874	return 0;
   3875}
   3876EXPORT_SYMBOL(tcp_connect);
   3877
   3878/* Send out a delayed ack, the caller does the policy checking
   3879 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
   3880 * for details.
   3881 */
   3882void tcp_send_delayed_ack(struct sock *sk)
   3883{
   3884	struct inet_connection_sock *icsk = inet_csk(sk);
   3885	int ato = icsk->icsk_ack.ato;
   3886	unsigned long timeout;
   3887
   3888	if (ato > TCP_DELACK_MIN) {
   3889		const struct tcp_sock *tp = tcp_sk(sk);
   3890		int max_ato = HZ / 2;
   3891
   3892		if (inet_csk_in_pingpong_mode(sk) ||
   3893		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
   3894			max_ato = TCP_DELACK_MAX;
   3895
   3896		/* Slow path, intersegment interval is "high". */
   3897
   3898		/* If some rtt estimate is known, use it to bound delayed ack.
   3899		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
   3900		 * directly.
   3901		 */
   3902		if (tp->srtt_us) {
   3903			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
   3904					TCP_DELACK_MIN);
   3905
   3906			if (rtt < max_ato)
   3907				max_ato = rtt;
   3908		}
   3909
   3910		ato = min(ato, max_ato);
   3911	}
   3912
   3913	ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
   3914
   3915	/* Stay within the limit we were given */
   3916	timeout = jiffies + ato;
   3917
    3918	/* Use the new timeout only if there wasn't an older one earlier. */
   3919	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
   3920		/* If delack timer is about to expire, send ACK now. */
   3921		if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
   3922			tcp_send_ack(sk);
   3923			return;
   3924		}
   3925
   3926		if (!time_before(timeout, icsk->icsk_ack.timeout))
   3927			timeout = icsk->icsk_ack.timeout;
   3928	}
   3929	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
   3930	icsk->icsk_ack.timeout = timeout;
   3931	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
   3932}
   3933
   3934/* This routine sends an ack and also updates the window. */
   3935void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
   3936{
   3937	struct sk_buff *buff;
   3938
   3939	/* If we have been reset, we may not send again. */
   3940	if (sk->sk_state == TCP_CLOSE)
   3941		return;
   3942
   3943	/* We are not putting this on the write queue, so
   3944	 * tcp_transmit_skb() will set the ownership to this
   3945	 * sock.
   3946	 */
   3947	buff = alloc_skb(MAX_TCP_HEADER,
   3948			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
   3949	if (unlikely(!buff)) {
   3950		struct inet_connection_sock *icsk = inet_csk(sk);
   3951		unsigned long delay;
   3952
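        		/* Allocation failed: fall back to a delayed ACK, backing off
        		 * the delack timer exponentially (bounded by TCP_RTO_MAX).
        		 */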
   3953		delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
   3954		if (delay < TCP_RTO_MAX)
   3955			icsk->icsk_ack.retry++;
   3956		inet_csk_schedule_ack(sk);
   3957		icsk->icsk_ack.ato = TCP_ATO_MIN;
   3958		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
   3959		return;
   3960	}
   3961
   3962	/* Reserve space for headers and prepare control bits. */
   3963	skb_reserve(buff, MAX_TCP_HEADER);
   3964	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
   3965
   3966	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
   3967	 * too much.
   3968	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
   3969	 */
   3970	skb_set_tcp_pure_ack(buff);
   3971
   3972	/* Send it off, this clears delayed acks for us. */
   3973	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
   3974}
   3975EXPORT_SYMBOL_GPL(__tcp_send_ack);
   3976
   3977void tcp_send_ack(struct sock *sk)
   3978{
   3979	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
   3980}
   3981
   3982/* This routine sends a packet with an out of date sequence
   3983 * number. It assumes the other end will try to ack it.
   3984 *
    3985 * Question: what should we do in urgent mode?
    3986 * 4.4BSD forces sending a single byte of data. We cannot send
    3987 * out-of-window data, because we have SND.NXT==SND.MAX...
    3988 *
    3989 * Current solution: send TWO zero-length segments in urgent mode:
    3990 * one with SEG.SEQ=SND.UNA to deliver the urgent pointer, another
    3991 * out-of-date with SND.UNA-1 to probe the window.
   3992 */
   3993static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
   3994{
   3995	struct tcp_sock *tp = tcp_sk(sk);
   3996	struct sk_buff *skb;
   3997
   3998	/* We don't queue it, tcp_transmit_skb() sets ownership. */
   3999	skb = alloc_skb(MAX_TCP_HEADER,
   4000			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
   4001	if (!skb)
   4002		return -1;
   4003
   4004	/* Reserve space for headers and set control bits. */
   4005	skb_reserve(skb, MAX_TCP_HEADER);
   4006	/* Use a previous sequence.  This should cause the other
   4007	 * end to send an ack.  Don't queue or clone SKB, just
   4008	 * send it.
   4009	 */
   4010	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
   4011	NET_INC_STATS(sock_net(sk), mib);
   4012	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
   4013}
   4014
   4015/* Called from setsockopt( ... TCP_REPAIR ) */
   4016void tcp_send_window_probe(struct sock *sk)
   4017{
   4018	if (sk->sk_state == TCP_ESTABLISHED) {
   4019		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
   4020		tcp_mstamp_refresh(tcp_sk(sk));
   4021		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
   4022	}
   4023}
   4024
   4025/* Initiate keepalive or window probe from timer. */
   4026int tcp_write_wakeup(struct sock *sk, int mib)
   4027{
   4028	struct tcp_sock *tp = tcp_sk(sk);
   4029	struct sk_buff *skb;
   4030
   4031	if (sk->sk_state == TCP_CLOSE)
   4032		return -1;
   4033
   4034	skb = tcp_send_head(sk);
   4035	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
   4036		int err;
   4037		unsigned int mss = tcp_current_mss(sk);
   4038		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
   4039
   4040		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
   4041			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
   4042
    4043		/* We are probing the opening of a window
    4044		 * but the window size is != 0;
    4045		 * this must be a result of sender-side SWS avoidance.
    4046		 */
   4047		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
   4048		    skb->len > mss) {
   4049			seg_size = min(seg_size, mss);
   4050			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
   4051			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
   4052					 skb, seg_size, mss, GFP_ATOMIC))
   4053				return -1;
   4054		} else if (!tcp_skb_pcount(skb))
   4055			tcp_set_skb_tso_segs(skb, mss);
   4056
   4057		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
   4058		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
   4059		if (!err)
   4060			tcp_event_new_data_sent(sk, skb);
   4061		return err;
   4062	} else {
   4063		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
   4064			tcp_xmit_probe_skb(sk, 1, mib);
   4065		return tcp_xmit_probe_skb(sk, 0, mib);
   4066	}
   4067}
   4068
    4069/* A window probe timeout has occurred.  If the window is not closed,
    4070 * send a partial packet; otherwise send a zero window probe.
    4071 */
   4072void tcp_send_probe0(struct sock *sk)
   4073{
   4074	struct inet_connection_sock *icsk = inet_csk(sk);
   4075	struct tcp_sock *tp = tcp_sk(sk);
   4076	struct net *net = sock_net(sk);
   4077	unsigned long timeout;
   4078	int err;
   4079
   4080	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
   4081
   4082	if (tp->packets_out || tcp_write_queue_empty(sk)) {
   4083		/* Cancel probe timer, if it is not required. */
   4084		icsk->icsk_probes_out = 0;
   4085		icsk->icsk_backoff = 0;
   4086		icsk->icsk_probes_tstamp = 0;
   4087		return;
   4088	}
   4089
   4090	icsk->icsk_probes_out++;
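        	/* err <= 0 means the probe was handed to the stack (or hit a hard
        	 * error): back off exponentially (the backoff counter stops at
        	 * tcp_retries2). A positive err indicates a local (congestion)
        	 * drop, handled below.
        	 */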
   4091	if (err <= 0) {
   4092		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
   4093			icsk->icsk_backoff++;
   4094		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
   4095	} else {
    4096		/* If the packet was not sent due to local congestion,
    4097		 * let senders fight for local resources conservatively.
    4098		 */
   4099		timeout = TCP_RESOURCE_PROBE_INTERVAL;
   4100	}
   4101
   4102	timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
   4103	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
   4104}
   4105
   4106int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
   4107{
   4108	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
   4109	struct flowi fl;
   4110	int res;
   4111
   4112	/* Paired with WRITE_ONCE() in sock_setsockopt() */
   4113	if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
   4114		tcp_rsk(req)->txhash = net_tx_rndhash();
   4115	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
   4116				  NULL);
   4117	if (!res) {
   4118		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
   4119		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
   4120		if (unlikely(tcp_passive_fastopen(sk)))
   4121			tcp_sk(sk)->total_retrans++;
   4122		trace_tcp_retransmit_synack(sk, req);
   4123	}
   4124	return res;
   4125}
   4126EXPORT_SYMBOL(tcp_rtx_synack);