cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tcp_minisocks.c (27234B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		Implementation of the Transmission Control Protocol(TCP).
      8 *
      9 * Authors:	Ross Biro
     10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     11 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
     12 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
     13 *		Florian La Roche, <flla@stud.uni-sb.de>
     14 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
     15 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
     16 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     17 *		Matthew Dillon, <dillon@apollo.west.oic.com>
     18 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     19 *		Jorge Cwik, <jorge@laser.satlink.net>
     20 */
     21
     22#include <net/tcp.h>
     23#include <net/xfrm.h>
     24#include <net/busy_poll.h>
     25
     26static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
     27{
     28	if (seq == s_win)
     29		return true;
     30	if (after(end_seq, s_win) && before(seq, e_win))
     31		return true;
     32	return seq == e_win && seq == end_seq;
     33}
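
/*
 * A small illustration of the check above: a segment [seq, end_seq] is
 * acceptable when it overlaps the receive window [s_win, e_win), plus the
 * special case of a zero-length segment sitting exactly on a window edge.
 * All comparisons go through before()/after(), i.e. signed 32-bit
 * differences, so the test keeps working across sequence-number wraparound.
 * The helper below is hypothetical and purely illustrative.
 */
static inline bool tcp_in_window_example(void)
{
	u32 s_win = 1000, e_win = 2000;	/* receive window [1000, 2000) */
	u32 seq = 1500, end_seq = 1600;	/* segment overlapping the window */

	/* A zero-length segment exactly at the left edge is acceptable too. */
	if (!tcp_in_window(s_win, s_win, s_win, e_win))
		return false;

	return tcp_in_window(seq, end_seq, s_win, e_win);	/* true */
}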
     34
     35static enum tcp_tw_status
     36tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
     37				  const struct sk_buff *skb, int mib_idx)
     38{
     39	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
     40
     41	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
     42				  &tcptw->tw_last_oow_ack_time)) {
     43		/* Send ACK. Note, we do not put the bucket,
     44		 * it will be released by caller.
     45		 */
     46		return TCP_TW_ACK;
     47	}
     48
     49	/* We are rate-limiting, so just release the tw sock and drop skb. */
     50	inet_twsk_put(tw);
     51	return TCP_TW_SUCCESS;
     52}
     53
      54/*
      55 * * The main purpose of the TIME-WAIT state is to close a connection
      56 *   gracefully when one end sits in LAST-ACK or CLOSING, retransmitting its
      57 *   FIN (and, probably, a tail of data), and one or more of our ACKs are lost.
      58 * * What is the TIME-WAIT timeout? It is associated with the maximal packet
      59 *   lifetime in the internet, which leads to the wrong conclusion that it is
      60 *   set to catch "old duplicate segments" wandering off their path.
      61 *   That is not quite correct. The timeout is calculated so that it exceeds
      62 *   the maximal retransmission timeout by enough to allow the loss of one
      63 *   (or more) segments sent by the peer and of our ACKs. It can be derived from RTO.
      64 * * When a TIME-WAIT socket receives an RST, it means that the other end has
      65 *   finally closed and we are allowed to kill TIME-WAIT too.
      66 * * The second purpose of TIME-WAIT is catching old duplicate segments.
      67 *   Certainly this is pure paranoia, but if we load TIME-WAIT with this
      68 *   semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
      69 * * If we invented some more clever way to catch duplicates
      70 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
      71 *
      72 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
      73 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
      74 * from the very beginning.
      75 *
      76 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
      77 * is _not_ stateless. Strictly speaking, that means we would have to
      78 * spinlock it, which I do not want. The probability of misbehaviour
      79 * is ridiculously low and, it seems, we could use some mb() tricks
      80 * to avoid misreading sequence numbers, states etc.  --ANK
      81 *
      82 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
      83 */
     84enum tcp_tw_status
     85tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
     86			   const struct tcphdr *th)
     87{
     88	struct tcp_options_received tmp_opt;
     89	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
     90	bool paws_reject = false;
     91
     92	tmp_opt.saw_tstamp = 0;
     93	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
     94		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
     95
     96		if (tmp_opt.saw_tstamp) {
     97			if (tmp_opt.rcv_tsecr)
     98				tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
     99			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
    100			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
    101			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
    102		}
    103	}
    104
    105	if (tw->tw_substate == TCP_FIN_WAIT2) {
    106		/* Just repeat all the checks of tcp_rcv_state_process() */
    107
    108		/* Out of window, send ACK */
    109		if (paws_reject ||
    110		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
    111				   tcptw->tw_rcv_nxt,
    112				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
    113			return tcp_timewait_check_oow_rate_limit(
    114				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
    115
    116		if (th->rst)
    117			goto kill;
    118
    119		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
    120			return TCP_TW_RST;
    121
    122		/* Dup ACK? */
    123		if (!th->ack ||
    124		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
    125		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
    126			inet_twsk_put(tw);
    127			return TCP_TW_SUCCESS;
    128		}
    129
    130		/* New data or FIN. If new data arrive after half-duplex close,
    131		 * reset.
    132		 */
    133		if (!th->fin ||
    134		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
    135			return TCP_TW_RST;
    136
    137		/* FIN arrived, enter true time-wait state. */
    138		tw->tw_substate	  = TCP_TIME_WAIT;
    139		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    140		if (tmp_opt.saw_tstamp) {
    141			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
    142			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
    143		}
    144
    145		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
    146		return TCP_TW_ACK;
    147	}
    148
    149	/*
    150	 *	Now real TIME-WAIT state.
    151	 *
    152	 *	RFC 1122:
    153	 *	"When a connection is [...] on TIME-WAIT state [...]
    154	 *	[a TCP] MAY accept a new SYN from the remote TCP to
    155	 *	reopen the connection directly, if it:
    156	 *
    157	 *	(1)  assigns its initial sequence number for the new
    158	 *	connection to be larger than the largest sequence
    159	 *	number it used on the previous connection incarnation,
    160	 *	and
    161	 *
    162	 *	(2)  returns to TIME-WAIT state if the SYN turns out
    163	 *	to be an old duplicate".
    164	 */
    165
    166	if (!paws_reject &&
    167	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
    168	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
     169		/* An in-window segment; it can only be a reset or a bare ACK. */
    170
    171		if (th->rst) {
    172			/* This is TIME_WAIT assassination, in two flavors.
    173			 * Oh well... nobody has a sufficient solution to this
    174			 * protocol bug yet.
    175			 */
    176			if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) {
    177kill:
    178				inet_twsk_deschedule_put(tw);
    179				return TCP_TW_SUCCESS;
    180			}
    181		} else {
    182			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
    183		}
    184
    185		if (tmp_opt.saw_tstamp) {
    186			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
    187			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
    188		}
    189
    190		inet_twsk_put(tw);
    191		return TCP_TW_SUCCESS;
    192	}
    193
     194	/* Out of window segment.
     195
     196	   All such segments are ACKed immediately.
     197
     198	   The only exception is a new SYN. We accept it if it is not an
     199	   old duplicate and we are not in danger of being killed by
     200	   delayed old duplicates. The RFC check (that it carries a newer
     201	   sequence number) only works at rates below 40 Mbit/sec.
     202	   However, if PAWS works, it is reliable, and we may even relax
     203	   the silly sequence-space cutoff.
     204
     205	   RED-PEN: we violate the main RFC requirement: if this SYN turns
     206	   out to be an old duplicate (i.e. we receive an RST in reply to our
     207	   SYN-ACK), we must return the socket to time-wait state. That is
     208	   not good, but not fatal yet.
     209	 */
    210
    211	if (th->syn && !th->rst && !th->ack && !paws_reject &&
    212	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
    213	     (tmp_opt.saw_tstamp &&
    214	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
    215		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
    216		if (isn == 0)
    217			isn++;
    218		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
    219		return TCP_TW_SYN;
    220	}
    221
    222	if (paws_reject)
    223		__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
    224
    225	if (!th->rst) {
     226		/* In this case we must reset the TIMEWAIT timer.
     227		 *
     228		 * If it is an ACKless SYN it may be either an old duplicate
     229		 * or a new, valid SYN with a random sequence number < rcv_nxt.
     230		 * Do not reschedule in the latter case.
     231		 */
    232		if (paws_reject || th->ack)
    233			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
    234
    235		return tcp_timewait_check_oow_rate_limit(
    236			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
    237	}
    238	inet_twsk_put(tw);
    239	return TCP_TW_SUCCESS;
    240}
    241EXPORT_SYMBOL(tcp_timewait_state_process);
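
/*
 * A simplified sketch of how a receive path might act on the status returned
 * by tcp_timewait_state_process(). The real dispatch lives in the protocol
 * receive handlers (e.g. tcp_v4_rcv()) and handles more details; this
 * hypothetical helper only names the four possible outcomes.
 */
static inline void tcp_tw_status_example(enum tcp_tw_status status)
{
	switch (status) {
	case TCP_TW_SYN:
		/* Acceptable new SYN: hand the skb over to a listening socket. */
		break;
	case TCP_TW_ACK:
		/* Answer with a (possibly rate-limited) ACK. */
		break;
	case TCP_TW_RST:
		/* Answer with a reset. */
		break;
	case TCP_TW_SUCCESS:
		/* Nothing more to send; the skb is simply dropped. */
		break;
	}
}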
    242
    243/*
    244 * Move a socket to time-wait or dead fin-wait-2 state.
    245 */
    246void tcp_time_wait(struct sock *sk, int state, int timeo)
    247{
    248	const struct inet_connection_sock *icsk = inet_csk(sk);
    249	const struct tcp_sock *tp = tcp_sk(sk);
    250	struct inet_timewait_sock *tw;
    251	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
    252
    253	tw = inet_twsk_alloc(sk, tcp_death_row, state);
    254
    255	if (tw) {
    256		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
    257		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
    258		struct inet_sock *inet = inet_sk(sk);
    259
    260		tw->tw_transparent	= inet->transparent;
    261		tw->tw_mark		= sk->sk_mark;
    262		tw->tw_priority		= sk->sk_priority;
    263		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
    264		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
    265		tcptw->tw_snd_nxt	= tp->snd_nxt;
    266		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
    267		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
    268		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
    269		tcptw->tw_ts_offset	= tp->tsoffset;
    270		tcptw->tw_last_oow_ack_time = 0;
    271		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
    272#if IS_ENABLED(CONFIG_IPV6)
    273		if (tw->tw_family == PF_INET6) {
    274			struct ipv6_pinfo *np = inet6_sk(sk);
    275
    276			tw->tw_v6_daddr = sk->sk_v6_daddr;
    277			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
    278			tw->tw_tclass = np->tclass;
    279			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
    280			tw->tw_txhash = sk->sk_txhash;
    281			tw->tw_ipv6only = sk->sk_ipv6only;
    282		}
    283#endif
    284
    285#ifdef CONFIG_TCP_MD5SIG
    286		/*
    287		 * The timewait bucket does not have the key DB from the
    288		 * sock structure. We just make a quick copy of the
    289		 * md5 key being used (if indeed we are using one)
    290		 * so the timewait ack generating code has the key.
    291		 */
    292		do {
    293			tcptw->tw_md5_key = NULL;
    294			if (static_branch_unlikely(&tcp_md5_needed)) {
    295				struct tcp_md5sig_key *key;
    296
    297				key = tp->af_specific->md5_lookup(sk, sk);
    298				if (key) {
    299					tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
    300					BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
    301				}
    302			}
    303		} while (0);
    304#endif
    305
    306		/* Get the TIME_WAIT timeout firing. */
    307		if (timeo < rto)
    308			timeo = rto;
    309
    310		if (state == TCP_TIME_WAIT)
    311			timeo = TCP_TIMEWAIT_LEN;
    312
     313		/* tw_timer is pinned, so we need to make sure BHs are disabled
     314		 * in the following section, otherwise the timer handler could run
     315		 * before we complete the initialization.
     316		 */
    317		local_bh_disable();
    318		inet_twsk_schedule(tw, timeo);
    319		/* Linkage updates.
    320		 * Note that access to tw after this point is illegal.
    321		 */
    322		inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
    323		local_bh_enable();
    324	} else {
    325		/* Sorry, if we're out of memory, just CLOSE this
    326		 * socket up.  We've got bigger problems than
    327		 * non-graceful socket closings.
    328		 */
    329		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
    330	}
    331
    332	tcp_update_metrics(sk);
    333	tcp_done(sk);
    334}
    335EXPORT_SYMBOL(tcp_time_wait);
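
/*
 * The rto value computed in tcp_time_wait() above, (icsk_rto << 2) -
 * (icsk_rto >> 1), is a shift-only way of writing 3.5 * RTO: four times the
 * retransmission timeout minus half of it. A minimal sketch with a concrete
 * number (hypothetical helper, purely illustrative):
 */
static inline unsigned long tcp_tw_min_timeout_example(unsigned long rto)
{
	/* e.g. rto = 200 jiffies: (200 << 2) - (200 >> 1) = 800 - 100 = 700 */
	return (rto << 2) - (rto >> 1);
}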
    336
    337void tcp_twsk_destructor(struct sock *sk)
    338{
    339#ifdef CONFIG_TCP_MD5SIG
    340	if (static_branch_unlikely(&tcp_md5_needed)) {
    341		struct tcp_timewait_sock *twsk = tcp_twsk(sk);
    342
    343		if (twsk->tw_md5_key)
    344			kfree_rcu(twsk->tw_md5_key, rcu);
    345	}
    346#endif
    347}
    348EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
    349
     350/* Warning: This function is called without sk_listener being locked.
     351 * Be sure to read socket fields once, as their values could change under us.
    352 */
    353void tcp_openreq_init_rwin(struct request_sock *req,
    354			   const struct sock *sk_listener,
    355			   const struct dst_entry *dst)
    356{
    357	struct inet_request_sock *ireq = inet_rsk(req);
    358	const struct tcp_sock *tp = tcp_sk(sk_listener);
    359	int full_space = tcp_full_space(sk_listener);
    360	u32 window_clamp;
    361	__u8 rcv_wscale;
    362	u32 rcv_wnd;
    363	int mss;
    364
    365	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
    366	window_clamp = READ_ONCE(tp->window_clamp);
    367	/* Set this up on the first call only */
    368	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
    369
     370	/* limit the window selection if the user enforces a smaller rx buffer */
    371	if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
    372	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
    373		req->rsk_window_clamp = full_space;
    374
    375	rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
    376	if (rcv_wnd == 0)
    377		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
    378	else if (full_space < rcv_wnd * mss)
    379		full_space = rcv_wnd * mss;
    380
    381	/* tcp_full_space because it is guaranteed to be the first packet */
    382	tcp_select_initial_window(sk_listener, full_space,
    383		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
    384		&req->rsk_rcv_wnd,
    385		&req->rsk_window_clamp,
    386		ireq->wscale_ok,
    387		&rcv_wscale,
    388		rcv_wnd);
    389	ireq->rcv_wscale = rcv_wscale;
    390}
    391EXPORT_SYMBOL(tcp_openreq_init_rwin);
    392
    393static void tcp_ecn_openreq_child(struct tcp_sock *tp,
    394				  const struct request_sock *req)
    395{
    396	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
    397}
    398
    399void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
    400{
    401	struct inet_connection_sock *icsk = inet_csk(sk);
    402	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
    403	bool ca_got_dst = false;
    404
    405	if (ca_key != TCP_CA_UNSPEC) {
    406		const struct tcp_congestion_ops *ca;
    407
    408		rcu_read_lock();
    409		ca = tcp_ca_find_key(ca_key);
    410		if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
    411			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
    412			icsk->icsk_ca_ops = ca;
    413			ca_got_dst = true;
    414		}
    415		rcu_read_unlock();
    416	}
    417
    418	/* If no valid choice made yet, assign current system default ca. */
    419	if (!ca_got_dst &&
    420	    (!icsk->icsk_ca_setsockopt ||
    421	     !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
    422		tcp_assign_congestion_control(sk);
    423
    424	tcp_set_ca_state(sk, TCP_CA_Open);
    425}
    426EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
    427
    428static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
    429				    struct request_sock *req,
    430				    struct tcp_sock *newtp)
    431{
    432#if IS_ENABLED(CONFIG_SMC)
    433	struct inet_request_sock *ireq;
    434
    435	if (static_branch_unlikely(&tcp_have_smc)) {
    436		ireq = inet_rsk(req);
    437		if (oldtp->syn_smc && !ireq->smc_ok)
    438			newtp->syn_smc = 0;
    439	}
    440#endif
    441}
    442
    443/* This is not only more efficient than what we used to do, it eliminates
    444 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
    445 *
     446 * Actually, we could save lots of memory writes here. The tp of the
     447 * listening socket contains all the necessary default parameters.
    448 */
    449struct sock *tcp_create_openreq_child(const struct sock *sk,
    450				      struct request_sock *req,
    451				      struct sk_buff *skb)
    452{
    453	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
    454	const struct inet_request_sock *ireq = inet_rsk(req);
    455	struct tcp_request_sock *treq = tcp_rsk(req);
    456	struct inet_connection_sock *newicsk;
    457	struct tcp_sock *oldtp, *newtp;
    458	u32 seq;
    459
    460	if (!newsk)
    461		return NULL;
    462
    463	newicsk = inet_csk(newsk);
    464	newtp = tcp_sk(newsk);
    465	oldtp = tcp_sk(sk);
    466
    467	smc_check_reset_syn_req(oldtp, req, newtp);
    468
    469	/* Now setup tcp_sock */
    470	newtp->pred_flags = 0;
    471
    472	seq = treq->rcv_isn + 1;
    473	newtp->rcv_wup = seq;
    474	WRITE_ONCE(newtp->copied_seq, seq);
    475	WRITE_ONCE(newtp->rcv_nxt, seq);
    476	newtp->segs_in = 1;
    477
    478	seq = treq->snt_isn + 1;
    479	newtp->snd_sml = newtp->snd_una = seq;
    480	WRITE_ONCE(newtp->snd_nxt, seq);
    481	newtp->snd_up = seq;
    482
    483	INIT_LIST_HEAD(&newtp->tsq_node);
    484	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
    485
    486	tcp_init_wl(newtp, treq->rcv_isn);
    487
    488	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
    489	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
    490
    491	newtp->lsndtime = tcp_jiffies32;
    492	newsk->sk_txhash = treq->txhash;
    493	newtp->total_retrans = req->num_retrans;
    494
    495	tcp_init_xmit_timers(newsk);
    496	WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
    497
    498	if (sock_flag(newsk, SOCK_KEEPOPEN))
    499		inet_csk_reset_keepalive_timer(newsk,
    500					       keepalive_time_when(newtp));
    501
    502	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
    503	newtp->rx_opt.sack_ok = ireq->sack_ok;
    504	newtp->window_clamp = req->rsk_window_clamp;
    505	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
    506	newtp->rcv_wnd = req->rsk_rcv_wnd;
    507	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
    508	if (newtp->rx_opt.wscale_ok) {
    509		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
    510		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
    511	} else {
    512		newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
    513		newtp->window_clamp = min(newtp->window_clamp, 65535U);
    514	}
    515	newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
    516	newtp->max_window = newtp->snd_wnd;
    517
    518	if (newtp->rx_opt.tstamp_ok) {
    519		newtp->rx_opt.ts_recent = req->ts_recent;
    520		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
    521		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    522	} else {
    523		newtp->rx_opt.ts_recent_stamp = 0;
    524		newtp->tcp_header_len = sizeof(struct tcphdr);
    525	}
    526	if (req->num_timeout) {
    527		newtp->undo_marker = treq->snt_isn;
    528		newtp->retrans_stamp = div_u64(treq->snt_synack,
    529					       USEC_PER_SEC / TCP_TS_HZ);
    530	}
    531	newtp->tsoffset = treq->ts_off;
    532#ifdef CONFIG_TCP_MD5SIG
    533	newtp->md5sig_info = NULL;	/*XXX*/
    534	if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req)))
    535		newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
    536#endif
    537	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
    538		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
    539	newtp->rx_opt.mss_clamp = req->mss;
    540	tcp_ecn_openreq_child(newtp, req);
    541	newtp->fastopen_req = NULL;
    542	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
    543
    544	tcp_bpf_clone(sk, newsk);
    545
    546	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
    547
    548	return newsk;
    549}
    550EXPORT_SYMBOL(tcp_create_openreq_child);
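
/*
 * The send window derived in tcp_create_openreq_child() above,
 * ntohs(tcp_hdr(skb)->window) << snd_wscale, expands the 16-bit window field
 * of the handshake-completing ACK by the window-scale factor the peer
 * advertised in its SYN. A worked example with concrete numbers
 * (hypothetical helper, purely illustrative):
 */
static inline u32 tcp_snd_wnd_scale_example(void)
{
	u16 raw_window = 65535;	/* value as read from the TCP header */
	u8 snd_wscale = 7;	/* window scale the peer advertised */

	/* 65535 << 7 = 8388480 bytes, i.e. roughly an 8 MB send window. */
	return (u32)raw_window << snd_wscale;
}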
    551
    552/*
    553 * Process an incoming packet for SYN_RECV sockets represented as a
    554 * request_sock. Normally sk is the listener socket but for TFO it
    555 * points to the child socket.
    556 *
     557 * XXX (TFO) - The current impl contains a special check for ACK
     558 * validation here and another inside tcp_v4_reqsk_send_ack(). Can we do better?
    559 *
    560 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
    561 */
    562
    563struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
    564			   struct request_sock *req,
    565			   bool fastopen, bool *req_stolen)
    566{
    567	struct tcp_options_received tmp_opt;
    568	struct sock *child;
    569	const struct tcphdr *th = tcp_hdr(skb);
    570	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
    571	bool paws_reject = false;
    572	bool own_req;
    573
    574	tmp_opt.saw_tstamp = 0;
    575	if (th->doff > (sizeof(struct tcphdr)>>2)) {
    576		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
    577
    578		if (tmp_opt.saw_tstamp) {
    579			tmp_opt.ts_recent = req->ts_recent;
    580			if (tmp_opt.rcv_tsecr)
    581				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
     582			/* We do not store the true stamp, but it is not required;
     583			 * it can be estimated (approximately)
     584			 * from other data.
     585			 */
    586			tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
    587			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
    588		}
    589	}
    590
    591	/* Check for pure retransmitted SYN. */
    592	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
    593	    flg == TCP_FLAG_SYN &&
    594	    !paws_reject) {
    595		/*
     596		 * RFC793 draws this case (incorrectly! It was fixed in RFC1122)
     597		 * on figure 6 and figure 8, but the formal
     598		 * protocol description says NOTHING.
     599		 * To be more exact, it says that we should send an ACK,
     600		 * because this segment (at least, if it has no data)
     601		 * is out of window.
     602		 *
     603		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
     604		 *  describe the SYN-RECV state. All of that description
     605		 *  is wrong; we cannot trust it and should
     606		 *  rely only on common sense and implementation
     607		 *  experience.
     608		 *
     609		 * Enforce "SYN-ACK" according to figure 8 and figure 6
     610		 * of RFC793, as fixed by RFC1122.
     611		 *
     612		 * Note that even if there is new data in the SYN packet,
     613		 * it will be thrown away too.
    614		 *
    615		 * Reset timer after retransmitting SYNACK, similar to
    616		 * the idea of fast retransmit in recovery.
    617		 */
    618		if (!tcp_oow_rate_limited(sock_net(sk), skb,
    619					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
    620					  &tcp_rsk(req)->last_oow_ack_time) &&
    621
    622		    !inet_rtx_syn_ack(sk, req)) {
    623			unsigned long expires = jiffies;
    624
    625			expires += reqsk_timeout(req, TCP_RTO_MAX);
    626			if (!fastopen)
    627				mod_timer_pending(&req->rsk_timer, expires);
    628			else
    629				req->rsk_timer.expires = expires;
    630		}
    631		return NULL;
    632	}
    633
    634	/* Further reproduces section "SEGMENT ARRIVES"
    635	   for state SYN-RECEIVED of RFC793.
     636	   It is broken, however: the only case it fails to handle
     637	   is crossed SYNs.
    638
    639	   You would think that SYN crossing is impossible here, since
    640	   we should have a SYN_SENT socket (from connect()) on our end,
    641	   but this is not true if the crossed SYNs were sent to both
    642	   ends by a malicious third party.  We must defend against this,
    643	   and to do that we first verify the ACK (as per RFC793, page
    644	   36) and reset if it is invalid.  Is this a true full defense?
    645	   To convince ourselves, let us consider a way in which the ACK
    646	   test can still pass in this 'malicious crossed SYNs' case.
    647	   Malicious sender sends identical SYNs (and thus identical sequence
    648	   numbers) to both A and B:
    649
    650		A: gets SYN, seq=7
    651		B: gets SYN, seq=7
    652
    653	   By our good fortune, both A and B select the same initial
    654	   send sequence number of seven :-)
    655
    656		A: sends SYN|ACK, seq=7, ack_seq=8
    657		B: sends SYN|ACK, seq=7, ack_seq=8
    658
    659	   So we are now A eating this SYN|ACK, ACK test passes.  So
    660	   does sequence test, SYN is truncated, and thus we consider
    661	   it a bare ACK.
    662
    663	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
    664	   bare ACK.  Otherwise, we create an established connection.  Both
    665	   ends (listening sockets) accept the new incoming connection and try
    666	   to talk to each other. 8-)
    667
     668	   Note: This case is both harmless and rare.  The probability is about the
     669	   same as us discovering intelligent life on another planet tomorrow.
    670
     671	   But generally, we should (RFC lies!) accept ACK
     672	   from SYNACK both here and in tcp_rcv_state_process().
     673	   tcp_rcv_state_process() does not, hence neither do we.
     674
     675	   Note that the case is absolutely generic:
     676	   we cannot optimize anything here without
     677	   violating the protocol. All the checks must be made
     678	   before any attempt to create a socket.
    679	 */
    680
    681	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
    682	 *                  and the incoming segment acknowledges something not yet
    683	 *                  sent (the segment carries an unacceptable ACK) ...
    684	 *                  a reset is sent."
    685	 *
    686	 * Invalid ACK: reset will be sent by listening socket.
    687	 * Note that the ACK validity check for a Fast Open socket is done
    688	 * elsewhere and is checked directly against the child socket rather
    689	 * than req because user data may have been sent out.
    690	 */
    691	if ((flg & TCP_FLAG_ACK) && !fastopen &&
    692	    (TCP_SKB_CB(skb)->ack_seq !=
    693	     tcp_rsk(req)->snt_isn + 1))
    694		return sk;
    695
     696	/* Also, it would not be a bad idea to check rcv_tsecr, which
     697	 * is essentially an ACK extension; too-early or too-late values
     698	 * should cause a reset in unsynchronized states.
     699	 */
    700
    701	/* RFC793: "first check sequence number". */
    702
    703	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
    704					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
    705		/* Out of window: send ACK and drop. */
    706		if (!(flg & TCP_FLAG_RST) &&
    707		    !tcp_oow_rate_limited(sock_net(sk), skb,
    708					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
    709					  &tcp_rsk(req)->last_oow_ack_time))
    710			req->rsk_ops->send_ack(sk, skb, req);
    711		if (paws_reject)
    712			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
    713		return NULL;
    714	}
    715
    716	/* In sequence, PAWS is OK. */
    717
    718	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
    719		req->ts_recent = tmp_opt.rcv_tsval;
    720
    721	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
    722		/* Truncate SYN, it is out of window starting
    723		   at tcp_rsk(req)->rcv_isn + 1. */
    724		flg &= ~TCP_FLAG_SYN;
    725	}
    726
    727	/* RFC793: "second check the RST bit" and
    728	 *	   "fourth, check the SYN bit"
    729	 */
    730	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
    731		__TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
    732		goto embryonic_reset;
    733	}
    734
    735	/* ACK sequence verified above, just make sure ACK is
    736	 * set.  If ACK not set, just silently drop the packet.
    737	 *
    738	 * XXX (TFO) - if we ever allow "data after SYN", the
    739	 * following check needs to be removed.
    740	 */
    741	if (!(flg & TCP_FLAG_ACK))
    742		return NULL;
    743
    744	/* For Fast Open no more processing is needed (sk is the
    745	 * child socket).
    746	 */
    747	if (fastopen)
    748		return sk;
    749
    750	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
    751	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
    752	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
    753		inet_rsk(req)->acked = 1;
    754		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
    755		return NULL;
    756	}
    757
     758	/* OK, the ACK is valid, create the full socket and
     759	 * feed this segment to it. It will repeat all
     760	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
     761	 * ESTABLISHED STATE. If it gets dropped after the
     762	 * socket is created, expect trouble.
     763	 */
    764	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
    765							 req, &own_req);
    766	if (!child)
    767		goto listen_overflow;
    768
    769	if (own_req && rsk_drop_req(req)) {
    770		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
    771		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
    772		return child;
    773	}
    774
    775	sock_rps_save_rxhash(child, skb);
    776	tcp_synack_rtt_meas(child, req);
    777	*req_stolen = !own_req;
    778	return inet_csk_complete_hashdance(sk, child, req, own_req);
    779
    780listen_overflow:
    781	if (sk != req->rsk_listener)
    782		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
    783
    784	if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
    785		inet_rsk(req)->acked = 1;
    786		return NULL;
    787	}
    788
    789embryonic_reset:
    790	if (!(flg & TCP_FLAG_RST)) {
     791		/* Received a bad SYN pkt - for TFO we try not to reset
     792		 * the local connection unless it's really necessary, to
     793		 * avoid becoming vulnerable to outside attacks aimed at
     794		 * resetting legit local connections.
     795		 */
    796		req->rsk_ops->send_reset(sk, skb);
    797	} else if (fastopen) { /* received a valid RST pkt */
    798		reqsk_fastopen_remove(sk, req, true);
    799		tcp_reset(sk, skb);
    800	}
    801	if (!fastopen) {
    802		bool unlinked = inet_csk_reqsk_queue_drop(sk, req);
    803
    804		if (unlinked)
    805			__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
    806		*req_stolen = !unlinked;
    807	}
    808	return NULL;
    809}
    810EXPORT_SYMBOL(tcp_check_req);
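
/*
 * tcp_check_req() above reduces the header flags to flg = tcp_flag_word(th) &
 * (TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_ACK), so a pure retransmitted SYN at
 * the original rcv_isn is simply flg == TCP_FLAG_SYN. A minimal sketch of that
 * flag classification (hypothetical helper, purely illustrative):
 */
static inline bool tcp_is_bare_syn_example(const struct tcphdr *th)
{
	__be32 flg = tcp_flag_word(th) &
		     (TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_ACK);

	/* SYN set, with neither RST nor ACK set. */
	return flg == TCP_FLAG_SYN;
}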
    811
    812/*
    813 * Queue segment on the new socket if the new socket is active,
    814 * otherwise we just shortcircuit this and continue with
    815 * the new socket.
    816 *
    817 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
     818 * when entering. But other states are possible due to a race: after
     819 * __inet_lookup_established() fails but before the listener lock is
     820 * obtained, other packets can cause the same connection to
     821 * be created.
     822 */
    823
    824int tcp_child_process(struct sock *parent, struct sock *child,
    825		      struct sk_buff *skb)
    826	__releases(&((child)->sk_lock.slock))
    827{
    828	int ret = 0;
    829	int state = child->sk_state;
    830
    831	/* record sk_napi_id and sk_rx_queue_mapping of child. */
    832	sk_mark_napi_id_set(child, skb);
    833
    834	tcp_segs_in(tcp_sk(child), skb);
    835	if (!sock_owned_by_user(child)) {
    836		ret = tcp_rcv_state_process(child, skb);
    837		/* Wakeup parent, send SIGIO */
    838		if (state == TCP_SYN_RECV && child->sk_state != state)
    839			parent->sk_data_ready(parent);
    840	} else {
     841		/* Alas, it is possible again, because we do the lookup
     842		 * in the main socket hash table and the lock on the listening
     843		 * socket no longer protects us.
     844		 */
    845		__sk_add_backlog(child, skb);
    846	}
    847
    848	bh_unlock_sock(child);
    849	sock_put(child);
    850	return ret;
    851}
    852EXPORT_SYMBOL(tcp_child_process);