cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tcp_ipv4.c (87548B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		Implementation of the Transmission Control Protocol(TCP).
      8 *
      9 *		IPv4 specific functions
     10 *
     11 *		code split from:
     12 *		linux/ipv4/tcp.c
     13 *		linux/ipv4/tcp_input.c
     14 *		linux/ipv4/tcp_output.c
     15 *
     16 *		See tcp.c for author information
     17 */
     18
     19/*
     20 * Changes:
     21 *		David S. Miller	:	New socket lookup architecture.
     22 *					This code is dedicated to John Dyson.
     23 *		David S. Miller :	Change semantics of established hash,
     24 *					half is devoted to TIME_WAIT sockets
     25 *					and the rest go in the other half.
     26 *		Andi Kleen :		Add support for syncookies and fixed
     27 *					some bugs: ip options weren't passed to
     28 *					the TCP layer, missed a check for an
     29 *					ACK bit.
     30 *		Andi Kleen :		Implemented fast path mtu discovery.
     31 *	     				Fixed many serious bugs in the
     32 *					request_sock handling and moved
     33 *					most of it into the af independent code.
     34 *					Added tail drop and some other bugfixes.
     35 *					Added new listen semantics.
     36 *		Mike McLagan	:	Routing by source
     37 *	Juan Jose Ciarlante:		ip_dynaddr bits
     38 *		Andi Kleen:		various fixes.
     39 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
     40 *					coma.
     41 *	Andi Kleen		:	Fix new listen.
     42 *	Andi Kleen		:	Fix accept error reporting.
     43 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
     44 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
     45 *					a single port at the same time.
     46 */
     47
     48#define pr_fmt(fmt) "TCP: " fmt
     49
     50#include <linux/bottom_half.h>
     51#include <linux/types.h>
     52#include <linux/fcntl.h>
     53#include <linux/module.h>
     54#include <linux/random.h>
     55#include <linux/cache.h>
     56#include <linux/jhash.h>
     57#include <linux/init.h>
     58#include <linux/times.h>
     59#include <linux/slab.h>
     60
     61#include <net/net_namespace.h>
     62#include <net/icmp.h>
     63#include <net/inet_hashtables.h>
     64#include <net/tcp.h>
     65#include <net/transp_v6.h>
     66#include <net/ipv6.h>
     67#include <net/inet_common.h>
     68#include <net/timewait_sock.h>
     69#include <net/xfrm.h>
     70#include <net/secure_seq.h>
     71#include <net/busy_poll.h>
     72
     73#include <linux/inet.h>
     74#include <linux/ipv6.h>
     75#include <linux/stddef.h>
     76#include <linux/proc_fs.h>
     77#include <linux/seq_file.h>
     78#include <linux/inetdevice.h>
     79#include <linux/btf_ids.h>
     80
     81#include <crypto/hash.h>
     82#include <linux/scatterlist.h>
     83
     84#include <trace/events/tcp.h>
     85
     86#ifdef CONFIG_TCP_MD5SIG
     87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
     88			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
     89#endif
     90
     91struct inet_hashinfo tcp_hashinfo;
     92EXPORT_SYMBOL(tcp_hashinfo);
     93
     94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
     95
     96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
     97{
     98	return secure_tcp_seq(ip_hdr(skb)->daddr,
     99			      ip_hdr(skb)->saddr,
    100			      tcp_hdr(skb)->dest,
    101			      tcp_hdr(skb)->source);
    102}
    103
    104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
    105{
    106	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
    107}
    108
    109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
    110{
    111	const struct inet_timewait_sock *tw = inet_twsk(sktw);
    112	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
    113	struct tcp_sock *tp = tcp_sk(sk);
    114	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
    115
    116	if (reuse == 2) {
    117		/* Still does not detect *everything* that goes through
    118		 * lo, since we require a loopback src or dst address
    119		 * or direct binding to 'lo' interface.
    120		 */
    121		bool loopback = false;
    122		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
    123			loopback = true;
    124#if IS_ENABLED(CONFIG_IPV6)
    125		if (tw->tw_family == AF_INET6) {
    126			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
    127			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
    128			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
    129			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
    130				loopback = true;
    131		} else
    132#endif
    133		{
    134			if (ipv4_is_loopback(tw->tw_daddr) ||
    135			    ipv4_is_loopback(tw->tw_rcv_saddr))
    136				loopback = true;
    137		}
    138		if (!loopback)
    139			reuse = 0;
    140	}
    141
     142	/* With PAWS, it is safe from the viewpoint
     143	   of data integrity. Even without PAWS it is safe provided sequence
     144	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
     145	
     146	   Actually, the idea is close to VJ's: only the timestamp cache is
     147	   held not per host but per port pair, and the TW bucket is used as
     148	   the state holder.
     149	
     150	   If the TW bucket has already been destroyed, we fall back to VJ's
     151	   scheme and use the initial timestamp retrieved from the peer table.
     152	 */
    153	if (tcptw->tw_ts_recent_stamp &&
    154	    (!twp || (reuse && time_after32(ktime_get_seconds(),
    155					    tcptw->tw_ts_recent_stamp)))) {
    156		/* In case of repair and re-using TIME-WAIT sockets we still
    157		 * want to be sure that it is safe as above but honor the
    158		 * sequence numbers and time stamps set as part of the repair
    159		 * process.
    160		 *
    161		 * Without this check re-using a TIME-WAIT socket with TCP
    162		 * repair would accumulate a -1 on the repair assigned
    163		 * sequence number. The first time it is reused the sequence
    164		 * is -1, the second time -2, etc. This fixes that issue
    165		 * without appearing to create any others.
    166		 */
    167		if (likely(!tp->repair)) {
    168			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
    169
    170			if (!seq)
    171				seq = 1;
    172			WRITE_ONCE(tp->write_seq, seq);
    173			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
    174			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
    175		}
    176		sock_hold(sktw);
    177		return 1;
    178	}
    179
    180	return 0;
    181}
    182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
    183
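/* A minimal userspace sketch (not kernel code): the `reuse` policy consulted
 * in tcp_twsk_unique() comes from the net.ipv4.tcp_tw_reuse sysctl, readable
 * under /proc (0 = disabled, 1 = enabled, 2 = enabled for loopback
 * connections only, which is the case handled explicitly above).
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "r");
	int mode = -1;

	if (f) {
		if (fscanf(f, "%d", &mode) != 1)
			mode = -1;
		fclose(f);
	}
	printf("net.ipv4.tcp_tw_reuse = %d (0=off, 1=on, 2=loopback only)\n", mode);
	return mode < 0;
}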
    184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
    185			      int addr_len)
    186{
    187	/* This check is replicated from tcp_v4_connect() and intended to
    188	 * prevent BPF program called below from accessing bytes that are out
    189	 * of the bound specified by user in addr_len.
    190	 */
    191	if (addr_len < sizeof(struct sockaddr_in))
    192		return -EINVAL;
    193
    194	sock_owned_by_me(sk);
    195
    196	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
    197}
    198
    199/* This will initiate an outgoing connection. */
    200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
    201{
    202	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    203	struct inet_sock *inet = inet_sk(sk);
    204	struct tcp_sock *tp = tcp_sk(sk);
    205	__be16 orig_sport, orig_dport;
    206	__be32 daddr, nexthop;
    207	struct flowi4 *fl4;
    208	struct rtable *rt;
    209	int err;
    210	struct ip_options_rcu *inet_opt;
    211	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
    212
    213	if (addr_len < sizeof(struct sockaddr_in))
    214		return -EINVAL;
    215
    216	if (usin->sin_family != AF_INET)
    217		return -EAFNOSUPPORT;
    218
    219	nexthop = daddr = usin->sin_addr.s_addr;
    220	inet_opt = rcu_dereference_protected(inet->inet_opt,
    221					     lockdep_sock_is_held(sk));
    222	if (inet_opt && inet_opt->opt.srr) {
    223		if (!daddr)
    224			return -EINVAL;
    225		nexthop = inet_opt->opt.faddr;
    226	}
    227
    228	orig_sport = inet->inet_sport;
    229	orig_dport = usin->sin_port;
    230	fl4 = &inet->cork.fl.u.ip4;
    231	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
    232			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
    233			      orig_dport, sk);
    234	if (IS_ERR(rt)) {
    235		err = PTR_ERR(rt);
    236		if (err == -ENETUNREACH)
    237			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
    238		return err;
    239	}
    240
    241	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
    242		ip_rt_put(rt);
    243		return -ENETUNREACH;
    244	}
    245
    246	if (!inet_opt || !inet_opt->opt.srr)
    247		daddr = fl4->daddr;
    248
    249	if (!inet->inet_saddr)
    250		inet->inet_saddr = fl4->saddr;
    251	sk_rcv_saddr_set(sk, inet->inet_saddr);
    252
    253	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
    254		/* Reset inherited state */
    255		tp->rx_opt.ts_recent	   = 0;
    256		tp->rx_opt.ts_recent_stamp = 0;
    257		if (likely(!tp->repair))
    258			WRITE_ONCE(tp->write_seq, 0);
    259	}
    260
    261	inet->inet_dport = usin->sin_port;
    262	sk_daddr_set(sk, daddr);
    263
    264	inet_csk(sk)->icsk_ext_hdr_len = 0;
    265	if (inet_opt)
    266		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    267
    268	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
    269
    270	/* Socket identity is still unknown (sport may be zero).
     271	 * However we set state to SYN-SENT and, without releasing the socket
     272	 * lock, select a source port, enter ourselves into the hash tables and
    273	 * complete initialization after this.
    274	 */
    275	tcp_set_state(sk, TCP_SYN_SENT);
    276	err = inet_hash_connect(tcp_death_row, sk);
    277	if (err)
    278		goto failure;
    279
    280	sk_set_txhash(sk);
    281
    282	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
    283			       inet->inet_sport, inet->inet_dport, sk);
    284	if (IS_ERR(rt)) {
    285		err = PTR_ERR(rt);
    286		rt = NULL;
    287		goto failure;
    288	}
    289	/* OK, now commit destination to socket.  */
    290	sk->sk_gso_type = SKB_GSO_TCPV4;
    291	sk_setup_caps(sk, &rt->dst);
    292	rt = NULL;
    293
    294	if (likely(!tp->repair)) {
    295		if (!tp->write_seq)
    296			WRITE_ONCE(tp->write_seq,
    297				   secure_tcp_seq(inet->inet_saddr,
    298						  inet->inet_daddr,
    299						  inet->inet_sport,
    300						  usin->sin_port));
    301		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
    302						 inet->inet_saddr,
    303						 inet->inet_daddr);
    304	}
    305
    306	inet->inet_id = prandom_u32();
    307
    308	if (tcp_fastopen_defer_connect(sk, &err))
    309		return err;
    310	if (err)
    311		goto failure;
    312
    313	err = tcp_connect(sk);
    314
    315	if (err)
    316		goto failure;
    317
    318	return 0;
    319
    320failure:
    321	/*
    322	 * This unhashes the socket and releases the local port,
    323	 * if necessary.
    324	 */
    325	tcp_set_state(sk, TCP_CLOSE);
    326	ip_rt_put(rt);
    327	sk->sk_route_caps = 0;
    328	inet->inet_dport = 0;
    329	return err;
    330}
    331EXPORT_SYMBOL(tcp_v4_connect);
    332
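/* A minimal userspace sketch (not kernel code): an ordinary connect() on an
 * AF_INET stream socket is what ends up in tcp_v4_connect() above, after
 * tcp_v4_pre_connect() has run the cgroup BPF hook. The address 192.0.2.1
 * and port 80 are arbitrary placeholders.
 */
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(80) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr) != 1)
		return 1;
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");	/* e.g. ENETUNREACH from the route lookup */
	close(fd);
	return 0;
}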
    333/*
    334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
    335 * It can be called through tcp_release_cb() if socket was owned by user
    336 * at the time tcp_v4_err() was called to handle ICMP message.
    337 */
    338void tcp_v4_mtu_reduced(struct sock *sk)
    339{
    340	struct inet_sock *inet = inet_sk(sk);
    341	struct dst_entry *dst;
    342	u32 mtu;
    343
    344	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
    345		return;
    346	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
    347	dst = inet_csk_update_pmtu(sk, mtu);
    348	if (!dst)
    349		return;
    350
     351	/* Something is about to go wrong... Remember the soft error
     352	 * in case this connection is not able to recover.
    353	 */
    354	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
    355		sk->sk_err_soft = EMSGSIZE;
    356
    357	mtu = dst_mtu(dst);
    358
    359	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
    360	    ip_sk_accept_pmtu(sk) &&
    361	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
    362		tcp_sync_mss(sk, mtu);
    363
    364		/* Resend the TCP packet because it's
    365		 * clear that the old packet has been
    366		 * dropped. This is the new "fast" path mtu
    367		 * discovery.
    368		 */
    369		tcp_simple_retransmit(sk);
    370	} /* else let the usual retransmit timer handle it */
    371}
    372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
    373
    374static void do_redirect(struct sk_buff *skb, struct sock *sk)
    375{
    376	struct dst_entry *dst = __sk_dst_check(sk, 0);
    377
    378	if (dst)
    379		dst->ops->redirect(dst, sk, skb);
    380}
    381
    382
    383/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
    384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
    385{
    386	struct request_sock *req = inet_reqsk(sk);
    387	struct net *net = sock_net(sk);
    388
    389	/* ICMPs are not backlogged, hence we cannot get
    390	 * an established socket here.
    391	 */
    392	if (seq != tcp_rsk(req)->snt_isn) {
    393		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
    394	} else if (abort) {
    395		/*
    396		 * Still in SYN_RECV, just remove it silently.
    397		 * There is no good way to pass the error to the newly
    398		 * created socket, and POSIX does not want network
    399		 * errors returned from accept().
    400		 */
    401		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
    402		tcp_listendrop(req->rsk_listener);
    403	}
    404	reqsk_put(req);
    405}
    406EXPORT_SYMBOL(tcp_req_err);
    407
    408/* TCP-LD (RFC 6069) logic */
    409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
    410{
    411	struct inet_connection_sock *icsk = inet_csk(sk);
    412	struct tcp_sock *tp = tcp_sk(sk);
    413	struct sk_buff *skb;
    414	s32 remaining;
    415	u32 delta_us;
    416
    417	if (sock_owned_by_user(sk))
    418		return;
    419
    420	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
    421	    !icsk->icsk_backoff)
    422		return;
    423
    424	skb = tcp_rtx_queue_head(sk);
    425	if (WARN_ON_ONCE(!skb))
    426		return;
    427
    428	icsk->icsk_backoff--;
    429	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
    430	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
    431
    432	tcp_mstamp_refresh(tp);
    433	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
    434	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
    435
    436	if (remaining > 0) {
    437		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
    438					  remaining, TCP_RTO_MAX);
    439	} else {
    440		/* RTO revert clocked out retransmission.
    441		 * Will retransmit now.
    442		 */
    443		tcp_retransmit_timer(sk);
    444	}
    445}
    446EXPORT_SYMBOL(tcp_ld_RTO_revert);
    447
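/* An illustrative sketch (plain C, not kernel API) of the arithmetic being
 * reverted here: the retransmit timer backs off exponentially up to
 * TCP_RTO_MAX (120 s in this tree), and RFC 6069 undoes one doubling and
 * re-arms the timer with whatever time is left. The names and millisecond
 * units below are assumptions of this sketch.
 */
#include <stdint.h>

#define RTO_MAX_MS 120000u

static uint32_t backed_off_rto_ms(uint32_t base_rto_ms, unsigned int backoff)
{
	uint64_t rto = (uint64_t)base_rto_ms << backoff;

	return rto > RTO_MAX_MS ? RTO_MAX_MS : (uint32_t)rto;
}

/* Time left on the timer after undoing one backoff step; 0 means the
 * reverted RTO has already expired and we retransmit immediately. */
static uint32_t rto_revert_remaining_ms(uint32_t base_rto_ms,
					unsigned int backoff,
					uint32_t elapsed_ms)
{
	uint32_t rto = backed_off_rto_ms(base_rto_ms, backoff ? backoff - 1 : 0);

	return elapsed_ms < rto ? rto - elapsed_ms : 0;
}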
    448/*
    449 * This routine is called by the ICMP module when it gets some
    450 * sort of error condition.  If err < 0 then the socket should
    451 * be closed and the error returned to the user.  If err > 0
    452 * it's just the icmp type << 8 | icmp code.  After adjustment
    453 * header points to the first 8 bytes of the tcp header.  We need
    454 * to find the appropriate port.
    455 *
    456 * The locking strategy used here is very "optimistic". When
    457 * someone else accesses the socket the ICMP is just dropped
    458 * and for some paths there is no check at all.
    459 * A more general error queue to queue errors for later handling
    460 * is probably better.
    461 *
    462 */
    463
    464int tcp_v4_err(struct sk_buff *skb, u32 info)
    465{
    466	const struct iphdr *iph = (const struct iphdr *)skb->data;
    467	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
    468	struct tcp_sock *tp;
    469	struct inet_sock *inet;
    470	const int type = icmp_hdr(skb)->type;
    471	const int code = icmp_hdr(skb)->code;
    472	struct sock *sk;
    473	struct request_sock *fastopen;
    474	u32 seq, snd_una;
    475	int err;
    476	struct net *net = dev_net(skb->dev);
    477
    478	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
    479				       th->dest, iph->saddr, ntohs(th->source),
    480				       inet_iif(skb), 0);
    481	if (!sk) {
    482		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
    483		return -ENOENT;
    484	}
    485	if (sk->sk_state == TCP_TIME_WAIT) {
    486		inet_twsk_put(inet_twsk(sk));
    487		return 0;
    488	}
    489	seq = ntohl(th->seq);
    490	if (sk->sk_state == TCP_NEW_SYN_RECV) {
    491		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
    492				     type == ICMP_TIME_EXCEEDED ||
    493				     (type == ICMP_DEST_UNREACH &&
    494				      (code == ICMP_NET_UNREACH ||
    495				       code == ICMP_HOST_UNREACH)));
    496		return 0;
    497	}
    498
    499	bh_lock_sock(sk);
    500	/* If too many ICMPs get dropped on busy
    501	 * servers this needs to be solved differently.
     502	 * We do take care of the PMTU discovery (RFC1191) special case:
     503	 * we can receive locally generated ICMP messages while the socket is held.
    504	 */
    505	if (sock_owned_by_user(sk)) {
    506		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
    507			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
    508	}
    509	if (sk->sk_state == TCP_CLOSE)
    510		goto out;
    511
    512	if (static_branch_unlikely(&ip4_min_ttl)) {
    513		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
    514		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
    515			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
    516			goto out;
    517		}
    518	}
    519
    520	tp = tcp_sk(sk);
     521	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
    522	fastopen = rcu_dereference(tp->fastopen_rsk);
    523	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
    524	if (sk->sk_state != TCP_LISTEN &&
    525	    !between(seq, snd_una, tp->snd_nxt)) {
    526		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
    527		goto out;
    528	}
    529
    530	switch (type) {
    531	case ICMP_REDIRECT:
    532		if (!sock_owned_by_user(sk))
    533			do_redirect(skb, sk);
    534		goto out;
    535	case ICMP_SOURCE_QUENCH:
    536		/* Just silently ignore these. */
    537		goto out;
    538	case ICMP_PARAMETERPROB:
    539		err = EPROTO;
    540		break;
    541	case ICMP_DEST_UNREACH:
    542		if (code > NR_ICMP_UNREACH)
    543			goto out;
    544
    545		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
    546			/* We are not interested in TCP_LISTEN and open_requests
     547			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
    548			 * they should go through unfragmented).
    549			 */
    550			if (sk->sk_state == TCP_LISTEN)
    551				goto out;
    552
    553			WRITE_ONCE(tp->mtu_info, info);
    554			if (!sock_owned_by_user(sk)) {
    555				tcp_v4_mtu_reduced(sk);
    556			} else {
    557				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
    558					sock_hold(sk);
    559			}
    560			goto out;
    561		}
    562
    563		err = icmp_err_convert[code].errno;
     564		/* Check whether this ICMP message allows us to revert the backoff
    565		 * (see RFC 6069)
    566		 */
    567		if (!fastopen &&
    568		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
    569			tcp_ld_RTO_revert(sk, seq);
    570		break;
    571	case ICMP_TIME_EXCEEDED:
    572		err = EHOSTUNREACH;
    573		break;
    574	default:
    575		goto out;
    576	}
    577
    578	switch (sk->sk_state) {
    579	case TCP_SYN_SENT:
    580	case TCP_SYN_RECV:
    581		/* Only in fast or simultaneous open. If a fast open socket is
    582		 * already accepted it is treated as a connected one below.
    583		 */
    584		if (fastopen && !fastopen->sk)
    585			break;
    586
    587		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
    588
    589		if (!sock_owned_by_user(sk)) {
    590			sk->sk_err = err;
    591
    592			sk_error_report(sk);
    593
    594			tcp_done(sk);
    595		} else {
    596			sk->sk_err_soft = err;
    597		}
    598		goto out;
    599	}
    600
    601	/* If we've already connected we will keep trying
    602	 * until we time out, or the user gives up.
    603	 *
     604	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
     605	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
     606	 * but it is obsoleted by PMTU discovery).
     607	 *
     608	 * Note that in the modern internet, where routing is unreliable
     609	 * and broken firewalls sit in every dark corner sending random
     610	 * errors ordered by their masters, even these two messages finally
     611	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
    612	 *
    613	 * Now we are in compliance with RFCs.
    614	 *							--ANK (980905)
    615	 */
    616
    617	inet = inet_sk(sk);
    618	if (!sock_owned_by_user(sk) && inet->recverr) {
    619		sk->sk_err = err;
    620		sk_error_report(sk);
    621	} else	{ /* Only an error on timeout */
    622		sk->sk_err_soft = err;
    623	}
    624
    625out:
    626	bh_unlock_sock(sk);
    627	sock_put(sk);
    628	return 0;
    629}
    630
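/* A minimal userspace sketch (not kernel code): the sk->sk_err set in
 * tcp_v4_err() reaches applications through SO_ERROR (or a failing
 * read/write). A common pattern is a non-blocking connect() followed by
 * poll() and getsockopt(SO_ERROR); the helper name and the 5 s timeout
 * are arbitrary.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static int connect_and_report(const char *ip, unsigned short port)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(port) };
	struct pollfd pfd;
	socklen_t len = sizeof(int);
	int err = 0, fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	if (inet_pton(AF_INET, ip, &dst.sin_addr) != 1) {
		close(fd);
		return -1;
	}
	fcntl(fd, F_SETFL, O_NONBLOCK);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0 &&
	    errno != EINPROGRESS) {
		close(fd);
		return -1;
	}

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, 5000);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);	/* e.g. EHOSTUNREACH */
	if (err)
		fprintf(stderr, "connect to %s: %s\n", ip, strerror(err));
	close(fd);
	return err ? -1 : 0;
}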
    631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
    632{
    633	struct tcphdr *th = tcp_hdr(skb);
    634
    635	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
    636	skb->csum_start = skb_transport_header(skb) - skb->head;
    637	skb->csum_offset = offsetof(struct tcphdr, check);
    638}
    639
    640/* This routine computes an IPv4 TCP checksum. */
    641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
    642{
    643	const struct inet_sock *inet = inet_sk(sk);
    644
    645	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
    646}
    647EXPORT_SYMBOL(tcp_v4_send_check);
    648
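/* An illustrative userspace sketch (not the kernel csum helpers): the full
 * TCP/IPv4 checksum that the partial sum seeded in __tcp_v4_send_check() is
 * later completed to is a 16-bit ones'-complement sum over the IPv4
 * pseudo-header plus the TCP segment. Function names here are illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <netinet/in.h>

static uint32_t sum_be16(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte, zero-padded */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* saddr/daddr are the addresses as they appear on the wire (4 bytes each);
 * seg is the TCP header (check field zeroed) plus payload of length len. */
static uint16_t tcp_ipv4_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
				  const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;

	sum = sum_be16(saddr, 4, sum);
	sum = sum_be16(daddr, 4, sum);
	sum += IPPROTO_TCP;		/* zero byte + protocol number */
	sum += (uint32_t)len;		/* TCP length: header + payload */
	sum = sum_be16(seg, len, sum);

	while (sum >> 16)		/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* store big-endian in the check field */
}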
    649/*
    650 *	This routine will send an RST to the other tcp.
    651 *
     652 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
     653 *		      for the reset?
     654 *	Answer: if a packet caused an RST, it is not for a socket
     655 *		existing in our system; if it does match a socket,
     656 *		it is just a duplicate segment or a bug in the other side's TCP.
     657 *		So we build the reply based only on the parameters that
     658 *		arrived with the segment.
    659 *	Exception: precedence violation. We do not implement it in any case.
    660 */
    661
    662#ifdef CONFIG_TCP_MD5SIG
    663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
    664#else
    665#define OPTION_BYTES sizeof(__be32)
    666#endif
    667
    668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
    669{
    670	const struct tcphdr *th = tcp_hdr(skb);
    671	struct {
    672		struct tcphdr th;
    673		__be32 opt[OPTION_BYTES / sizeof(__be32)];
    674	} rep;
    675	struct ip_reply_arg arg;
    676#ifdef CONFIG_TCP_MD5SIG
    677	struct tcp_md5sig_key *key = NULL;
    678	const __u8 *hash_location = NULL;
    679	unsigned char newhash[16];
    680	int genhash;
    681	struct sock *sk1 = NULL;
    682#endif
    683	u64 transmit_time = 0;
    684	struct sock *ctl_sk;
    685	struct net *net;
    686
    687	/* Never send a reset in response to a reset. */
    688	if (th->rst)
    689		return;
    690
     691	/* If sk is not NULL, it means we did a successful lookup and the incoming
    692	 * route had to be correct. prequeue might have dropped our dst.
    693	 */
    694	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
    695		return;
    696
    697	/* Swap the send and the receive. */
    698	memset(&rep, 0, sizeof(rep));
    699	rep.th.dest   = th->source;
    700	rep.th.source = th->dest;
    701	rep.th.doff   = sizeof(struct tcphdr) / 4;
    702	rep.th.rst    = 1;
    703
    704	if (th->ack) {
    705		rep.th.seq = th->ack_seq;
    706	} else {
    707		rep.th.ack = 1;
    708		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
    709				       skb->len - (th->doff << 2));
    710	}
    711
    712	memset(&arg, 0, sizeof(arg));
    713	arg.iov[0].iov_base = (unsigned char *)&rep;
    714	arg.iov[0].iov_len  = sizeof(rep.th);
    715
    716	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
    717#ifdef CONFIG_TCP_MD5SIG
    718	rcu_read_lock();
    719	hash_location = tcp_parse_md5sig_option(th);
    720	if (sk && sk_fullsock(sk)) {
    721		const union tcp_md5_addr *addr;
    722		int l3index;
    723
     724		/* If sdif is set, the packet ingressed via a device
    725		 * in an L3 domain and inet_iif is set to it.
    726		 */
    727		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
    728		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
    729		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
    730	} else if (hash_location) {
    731		const union tcp_md5_addr *addr;
    732		int sdif = tcp_v4_sdif(skb);
    733		int dif = inet_iif(skb);
    734		int l3index;
    735
    736		/*
     737		 * The active side is lost. Try to find the listening socket through
     738		 * the source port, and then find the md5 key through that socket.
     739		 * We do not lose any security here:
     740		 * the incoming packet is checked against the md5 hash of the key we
     741		 * find; no RST is generated if the md5 hash doesn't match.
    742		 */
    743		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
    744					     ip_hdr(skb)->saddr,
    745					     th->source, ip_hdr(skb)->daddr,
    746					     ntohs(th->source), dif, sdif);
     747		/* don't send an RST if we can't find a key */
    748		if (!sk1)
    749			goto out;
    750
     751		/* If sdif is set, the packet ingressed via a device
    752		 * in an L3 domain and dif is set to it.
    753		 */
    754		l3index = sdif ? dif : 0;
    755		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
    756		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
    757		if (!key)
    758			goto out;
    759
    760
    761		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
    762		if (genhash || memcmp(hash_location, newhash, 16) != 0)
    763			goto out;
    764
    765	}
    766
    767	if (key) {
    768		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
    769				   (TCPOPT_NOP << 16) |
    770				   (TCPOPT_MD5SIG << 8) |
    771				   TCPOLEN_MD5SIG);
    772		/* Update length and the length the header thinks exists */
    773		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
    774		rep.th.doff = arg.iov[0].iov_len / 4;
    775
    776		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
    777				     key, ip_hdr(skb)->saddr,
    778				     ip_hdr(skb)->daddr, &rep.th);
    779	}
    780#endif
    781	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
    782	if (rep.opt[0] == 0) {
    783		__be32 mrst = mptcp_reset_option(skb);
    784
    785		if (mrst) {
    786			rep.opt[0] = mrst;
    787			arg.iov[0].iov_len += sizeof(mrst);
    788			rep.th.doff = arg.iov[0].iov_len / 4;
    789		}
    790	}
    791
    792	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
    793				      ip_hdr(skb)->saddr, /* XXX */
    794				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
    795	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
    796	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
    797
     798	/* When the socket is gone, all binding information is lost and
     799	 * routing might fail in this case. No choice here: if we choose to force
     800	 * the input interface, we will misroute in the case of an asymmetric route.
    801	 */
    802	if (sk) {
    803		arg.bound_dev_if = sk->sk_bound_dev_if;
    804		if (sk_fullsock(sk))
    805			trace_tcp_send_reset(sk, skb);
    806	}
    807
    808	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
    809		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
    810
    811	arg.tos = ip_hdr(skb)->tos;
    812	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
    813	local_bh_disable();
    814	ctl_sk = this_cpu_read(ipv4_tcp_sk);
    815	sock_net_set(ctl_sk, net);
    816	if (sk) {
    817		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
    818				   inet_twsk(sk)->tw_mark : sk->sk_mark;
    819		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
    820				   inet_twsk(sk)->tw_priority : sk->sk_priority;
    821		transmit_time = tcp_transmit_time(sk);
    822	}
    823	ip_send_unicast_reply(ctl_sk,
    824			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
    825			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
    826			      &arg, arg.iov[0].iov_len,
    827			      transmit_time);
    828
    829	ctl_sk->sk_mark = 0;
    830	sock_net_set(ctl_sk, &init_net);
    831	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
    832	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
    833	local_bh_enable();
    834
    835#ifdef CONFIG_TCP_MD5SIG
    836out:
    837	rcu_read_unlock();
    838#endif
    839}
    840
     841/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
     842   outside of socket context, is certainly ugly. What can I do?
    843 */
    844
    845static void tcp_v4_send_ack(const struct sock *sk,
    846			    struct sk_buff *skb, u32 seq, u32 ack,
    847			    u32 win, u32 tsval, u32 tsecr, int oif,
    848			    struct tcp_md5sig_key *key,
    849			    int reply_flags, u8 tos)
    850{
    851	const struct tcphdr *th = tcp_hdr(skb);
    852	struct {
    853		struct tcphdr th;
    854		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
    855#ifdef CONFIG_TCP_MD5SIG
    856			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
    857#endif
    858			];
    859	} rep;
    860	struct net *net = sock_net(sk);
    861	struct ip_reply_arg arg;
    862	struct sock *ctl_sk;
    863	u64 transmit_time;
    864
    865	memset(&rep.th, 0, sizeof(struct tcphdr));
    866	memset(&arg, 0, sizeof(arg));
    867
    868	arg.iov[0].iov_base = (unsigned char *)&rep;
    869	arg.iov[0].iov_len  = sizeof(rep.th);
    870	if (tsecr) {
    871		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
    872				   (TCPOPT_TIMESTAMP << 8) |
    873				   TCPOLEN_TIMESTAMP);
    874		rep.opt[1] = htonl(tsval);
    875		rep.opt[2] = htonl(tsecr);
    876		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
    877	}
    878
    879	/* Swap the send and the receive. */
    880	rep.th.dest    = th->source;
    881	rep.th.source  = th->dest;
    882	rep.th.doff    = arg.iov[0].iov_len / 4;
    883	rep.th.seq     = htonl(seq);
    884	rep.th.ack_seq = htonl(ack);
    885	rep.th.ack     = 1;
    886	rep.th.window  = htons(win);
    887
    888#ifdef CONFIG_TCP_MD5SIG
    889	if (key) {
    890		int offset = (tsecr) ? 3 : 0;
    891
    892		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
    893					  (TCPOPT_NOP << 16) |
    894					  (TCPOPT_MD5SIG << 8) |
    895					  TCPOLEN_MD5SIG);
    896		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
    897		rep.th.doff = arg.iov[0].iov_len/4;
    898
    899		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
    900				    key, ip_hdr(skb)->saddr,
    901				    ip_hdr(skb)->daddr, &rep.th);
    902	}
    903#endif
    904	arg.flags = reply_flags;
    905	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
    906				      ip_hdr(skb)->saddr, /* XXX */
    907				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
    908	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
    909	if (oif)
    910		arg.bound_dev_if = oif;
    911	arg.tos = tos;
    912	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
    913	local_bh_disable();
    914	ctl_sk = this_cpu_read(ipv4_tcp_sk);
    915	sock_net_set(ctl_sk, net);
    916	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
    917			   inet_twsk(sk)->tw_mark : sk->sk_mark;
    918	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
    919			   inet_twsk(sk)->tw_priority : sk->sk_priority;
    920	transmit_time = tcp_transmit_time(sk);
    921	ip_send_unicast_reply(ctl_sk,
    922			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
    923			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
    924			      &arg, arg.iov[0].iov_len,
    925			      transmit_time);
    926
    927	ctl_sk->sk_mark = 0;
    928	sock_net_set(ctl_sk, &init_net);
    929	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
    930	local_bh_enable();
    931}
    932
    933static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
    934{
    935	struct inet_timewait_sock *tw = inet_twsk(sk);
    936	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
    937
    938	tcp_v4_send_ack(sk, skb,
    939			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
    940			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
    941			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
    942			tcptw->tw_ts_recent,
    943			tw->tw_bound_dev_if,
    944			tcp_twsk_md5_key(tcptw),
    945			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
    946			tw->tw_tos
    947			);
    948
    949	inet_twsk_put(tw);
    950}
    951
    952static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
    953				  struct request_sock *req)
    954{
    955	const union tcp_md5_addr *addr;
    956	int l3index;
    957
    958	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
    959	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
    960	 */
    961	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
    962					     tcp_sk(sk)->snd_nxt;
    963
    964	/* RFC 7323 2.3
    965	 * The window field (SEG.WND) of every outgoing segment, with the
    966	 * exception of <SYN> segments, MUST be right-shifted by
    967	 * Rcv.Wind.Shift bits:
    968	 */
    969	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
    970	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
    971	tcp_v4_send_ack(sk, skb, seq,
    972			tcp_rsk(req)->rcv_nxt,
    973			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
    974			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
    975			req->ts_recent,
    976			0,
    977			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
    978			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
    979			ip_hdr(skb)->tos);
    980}
    981
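/* A worked sketch (not kernel code) of the RFC 7323 rule quoted in
 * tcp_v4_reqsk_send_ack(): every non-SYN segment advertises the receive
 * window right-shifted by the negotiated Rcv.Wind.Shift. For example a
 * 262144-byte window with a window scale of 7 goes on the wire as 2048.
 */
#include <stdint.h>

static uint16_t advertised_window(uint32_t rcv_wnd, unsigned int rcv_wscale)
{
	return (uint16_t)(rcv_wnd >> rcv_wscale);	/* 262144 >> 7 == 2048 */
}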
    982/*
    983 *	Send a SYN-ACK after having received a SYN.
    984 *	This still operates on a request_sock only, not on a big
    985 *	socket.
    986 */
    987static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
    988			      struct flowi *fl,
    989			      struct request_sock *req,
    990			      struct tcp_fastopen_cookie *foc,
    991			      enum tcp_synack_type synack_type,
    992			      struct sk_buff *syn_skb)
    993{
    994	const struct inet_request_sock *ireq = inet_rsk(req);
    995	struct flowi4 fl4;
    996	int err = -1;
    997	struct sk_buff *skb;
    998	u8 tos;
    999
   1000	/* First, grab a route. */
   1001	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
   1002		return -1;
   1003
   1004	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
   1005
   1006	if (skb) {
   1007		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
   1008
   1009		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
   1010				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
   1011				(inet_sk(sk)->tos & INET_ECN_MASK) :
   1012				inet_sk(sk)->tos;
   1013
   1014		if (!INET_ECN_is_capable(tos) &&
   1015		    tcp_bpf_ca_needs_ecn((struct sock *)req))
   1016			tos |= INET_ECN_ECT_0;
   1017
   1018		rcu_read_lock();
   1019		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
   1020					    ireq->ir_rmt_addr,
   1021					    rcu_dereference(ireq->ireq_opt),
   1022					    tos);
   1023		rcu_read_unlock();
   1024		err = net_xmit_eval(err);
   1025	}
   1026
   1027	return err;
   1028}
   1029
   1030/*
   1031 *	IPv4 request_sock destructor.
   1032 */
   1033static void tcp_v4_reqsk_destructor(struct request_sock *req)
   1034{
   1035	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
   1036}
   1037
   1038#ifdef CONFIG_TCP_MD5SIG
   1039/*
   1040 * RFC2385 MD5 checksumming requires a mapping of
   1041 * IP address->MD5 Key.
   1042 * We need to maintain these in the sk structure.
   1043 */
   1044
   1045DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
   1046EXPORT_SYMBOL(tcp_md5_needed);
   1047
   1048static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
   1049{
   1050	if (!old)
   1051		return true;
   1052
   1053	/* l3index always overrides non-l3index */
   1054	if (old->l3index && new->l3index == 0)
   1055		return false;
   1056	if (old->l3index == 0 && new->l3index)
   1057		return true;
   1058
   1059	return old->prefixlen < new->prefixlen;
   1060}
   1061
   1062/* Find the Key structure for an address.  */
   1063struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
   1064					   const union tcp_md5_addr *addr,
   1065					   int family)
   1066{
   1067	const struct tcp_sock *tp = tcp_sk(sk);
   1068	struct tcp_md5sig_key *key;
   1069	const struct tcp_md5sig_info *md5sig;
   1070	__be32 mask;
   1071	struct tcp_md5sig_key *best_match = NULL;
   1072	bool match;
   1073
   1074	/* caller either holds rcu_read_lock() or socket lock */
   1075	md5sig = rcu_dereference_check(tp->md5sig_info,
   1076				       lockdep_sock_is_held(sk));
   1077	if (!md5sig)
   1078		return NULL;
   1079
   1080	hlist_for_each_entry_rcu(key, &md5sig->head, node,
   1081				 lockdep_sock_is_held(sk)) {
   1082		if (key->family != family)
   1083			continue;
   1084		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
   1085			continue;
   1086		if (family == AF_INET) {
   1087			mask = inet_make_mask(key->prefixlen);
   1088			match = (key->addr.a4.s_addr & mask) ==
   1089				(addr->a4.s_addr & mask);
   1090#if IS_ENABLED(CONFIG_IPV6)
   1091		} else if (family == AF_INET6) {
   1092			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
   1093						  key->prefixlen);
   1094#endif
   1095		} else {
   1096			match = false;
   1097		}
   1098
   1099		if (match && better_md5_match(best_match, key))
   1100			best_match = key;
   1101	}
   1102	return best_match;
   1103}
   1104EXPORT_SYMBOL(__tcp_md5_do_lookup);
   1105
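/* A minimal sketch (not kernel code) of the IPv4 prefix match performed in
 * __tcp_md5_do_lookup(): inet_make_mask() builds a network-byte-order mask
 * from the prefix length and both addresses are compared under it. The
 * function name below is illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool ipv4_prefix_match(uint32_t key_be, uint32_t addr_be,
			      unsigned int prefixlen)
{
	uint32_t mask = prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;

	return (key_be & mask) == (addr_be & mask);
}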
   1106static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
   1107						      const union tcp_md5_addr *addr,
   1108						      int family, u8 prefixlen,
   1109						      int l3index, u8 flags)
   1110{
   1111	const struct tcp_sock *tp = tcp_sk(sk);
   1112	struct tcp_md5sig_key *key;
   1113	unsigned int size = sizeof(struct in_addr);
   1114	const struct tcp_md5sig_info *md5sig;
   1115
   1116	/* caller either holds rcu_read_lock() or socket lock */
   1117	md5sig = rcu_dereference_check(tp->md5sig_info,
   1118				       lockdep_sock_is_held(sk));
   1119	if (!md5sig)
   1120		return NULL;
   1121#if IS_ENABLED(CONFIG_IPV6)
   1122	if (family == AF_INET6)
   1123		size = sizeof(struct in6_addr);
   1124#endif
   1125	hlist_for_each_entry_rcu(key, &md5sig->head, node,
   1126				 lockdep_sock_is_held(sk)) {
   1127		if (key->family != family)
   1128			continue;
   1129		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
   1130			continue;
   1131		if (key->l3index != l3index)
   1132			continue;
   1133		if (!memcmp(&key->addr, addr, size) &&
   1134		    key->prefixlen == prefixlen)
   1135			return key;
   1136	}
   1137	return NULL;
   1138}
   1139
   1140struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
   1141					 const struct sock *addr_sk)
   1142{
   1143	const union tcp_md5_addr *addr;
   1144	int l3index;
   1145
   1146	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
   1147						 addr_sk->sk_bound_dev_if);
   1148	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
   1149	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
   1150}
   1151EXPORT_SYMBOL(tcp_v4_md5_lookup);
   1152
   1153/* This can be called on a newly created socket, from other files */
   1154int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
   1155		   int family, u8 prefixlen, int l3index, u8 flags,
   1156		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
   1157{
   1158	/* Add Key to the list */
   1159	struct tcp_md5sig_key *key;
   1160	struct tcp_sock *tp = tcp_sk(sk);
   1161	struct tcp_md5sig_info *md5sig;
   1162
   1163	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
   1164	if (key) {
   1165		/* Pre-existing entry - just update that one.
   1166		 * Note that the key might be used concurrently.
    1167		 * data_race() is telling kcsan that we do not care about
    1168		 * key mismatches, since changing the MD5 key on live flows
   1169		 * can lead to packet drops.
   1170		 */
   1171		data_race(memcpy(key->key, newkey, newkeylen));
   1172
   1173		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
    1174		 * Also note that a reader could catch a new key->keylen value
    1175		 * but an old key->key[]; this is the reason we use __GFP_ZERO
   1176		 * at sock_kmalloc() time below these lines.
   1177		 */
   1178		WRITE_ONCE(key->keylen, newkeylen);
   1179
   1180		return 0;
   1181	}
   1182
   1183	md5sig = rcu_dereference_protected(tp->md5sig_info,
   1184					   lockdep_sock_is_held(sk));
   1185	if (!md5sig) {
   1186		md5sig = kmalloc(sizeof(*md5sig), gfp);
   1187		if (!md5sig)
   1188			return -ENOMEM;
   1189
   1190		sk_gso_disable(sk);
   1191		INIT_HLIST_HEAD(&md5sig->head);
   1192		rcu_assign_pointer(tp->md5sig_info, md5sig);
   1193	}
   1194
   1195	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
   1196	if (!key)
   1197		return -ENOMEM;
   1198	if (!tcp_alloc_md5sig_pool()) {
   1199		sock_kfree_s(sk, key, sizeof(*key));
   1200		return -ENOMEM;
   1201	}
   1202
   1203	memcpy(key->key, newkey, newkeylen);
   1204	key->keylen = newkeylen;
   1205	key->family = family;
   1206	key->prefixlen = prefixlen;
   1207	key->l3index = l3index;
   1208	key->flags = flags;
   1209	memcpy(&key->addr, addr,
   1210	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
   1211								 sizeof(struct in_addr));
   1212	hlist_add_head_rcu(&key->node, &md5sig->head);
   1213	return 0;
   1214}
   1215EXPORT_SYMBOL(tcp_md5_do_add);
   1216
   1217int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
   1218		   u8 prefixlen, int l3index, u8 flags)
   1219{
   1220	struct tcp_md5sig_key *key;
   1221
   1222	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
   1223	if (!key)
   1224		return -ENOENT;
   1225	hlist_del_rcu(&key->node);
   1226	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
   1227	kfree_rcu(key, rcu);
   1228	return 0;
   1229}
   1230EXPORT_SYMBOL(tcp_md5_do_del);
   1231
   1232static void tcp_clear_md5_list(struct sock *sk)
   1233{
   1234	struct tcp_sock *tp = tcp_sk(sk);
   1235	struct tcp_md5sig_key *key;
   1236	struct hlist_node *n;
   1237	struct tcp_md5sig_info *md5sig;
   1238
   1239	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
   1240
   1241	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
   1242		hlist_del_rcu(&key->node);
   1243		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
   1244		kfree_rcu(key, rcu);
   1245	}
   1246}
   1247
   1248static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
   1249				 sockptr_t optval, int optlen)
   1250{
   1251	struct tcp_md5sig cmd;
   1252	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
   1253	const union tcp_md5_addr *addr;
   1254	u8 prefixlen = 32;
   1255	int l3index = 0;
   1256	u8 flags;
   1257
   1258	if (optlen < sizeof(cmd))
   1259		return -EINVAL;
   1260
   1261	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
   1262		return -EFAULT;
   1263
   1264	if (sin->sin_family != AF_INET)
   1265		return -EINVAL;
   1266
   1267	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
   1268
   1269	if (optname == TCP_MD5SIG_EXT &&
   1270	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
   1271		prefixlen = cmd.tcpm_prefixlen;
   1272		if (prefixlen > 32)
   1273			return -EINVAL;
   1274	}
   1275
   1276	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
   1277	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
   1278		struct net_device *dev;
   1279
   1280		rcu_read_lock();
   1281		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
   1282		if (dev && netif_is_l3_master(dev))
   1283			l3index = dev->ifindex;
   1284
   1285		rcu_read_unlock();
   1286
   1287		/* ok to reference set/not set outside of rcu;
   1288		 * right now device MUST be an L3 master
   1289		 */
   1290		if (!dev || !l3index)
   1291			return -EINVAL;
   1292	}
   1293
   1294	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
   1295
   1296	if (!cmd.tcpm_keylen)
   1297		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
   1298
   1299	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
   1300		return -EINVAL;
   1301
   1302	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
   1303			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
   1304}
   1305
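/* A userspace sketch (not kernel code), assuming the UAPI struct tcp_md5sig
 * from <linux/tcp.h>: installing an RFC 2385 key for a peer via
 * setsockopt(TCP_MD5SIG) is what lands in tcp_v4_parse_md5_keys() above.
 * The helper name is illustrative.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int set_tcp_md5_key(int fd, const char *peer_ip,
			   const void *key, unsigned int keylen)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	if (inet_pton(AF_INET, peer_ip, &sin->sin_addr) != 1)
		return -1;
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}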
   1306static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
   1307				   __be32 daddr, __be32 saddr,
   1308				   const struct tcphdr *th, int nbytes)
   1309{
   1310	struct tcp4_pseudohdr *bp;
   1311	struct scatterlist sg;
   1312	struct tcphdr *_th;
   1313
   1314	bp = hp->scratch;
   1315	bp->saddr = saddr;
   1316	bp->daddr = daddr;
   1317	bp->pad = 0;
   1318	bp->protocol = IPPROTO_TCP;
   1319	bp->len = cpu_to_be16(nbytes);
   1320
   1321	_th = (struct tcphdr *)(bp + 1);
   1322	memcpy(_th, th, sizeof(*th));
   1323	_th->check = 0;
   1324
   1325	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
   1326	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
   1327				sizeof(*bp) + sizeof(*th));
   1328	return crypto_ahash_update(hp->md5_req);
   1329}
   1330
   1331static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
   1332			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
   1333{
   1334	struct tcp_md5sig_pool *hp;
   1335	struct ahash_request *req;
   1336
   1337	hp = tcp_get_md5sig_pool();
   1338	if (!hp)
   1339		goto clear_hash_noput;
   1340	req = hp->md5_req;
   1341
   1342	if (crypto_ahash_init(req))
   1343		goto clear_hash;
   1344	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
   1345		goto clear_hash;
   1346	if (tcp_md5_hash_key(hp, key))
   1347		goto clear_hash;
   1348	ahash_request_set_crypt(req, NULL, md5_hash, 0);
   1349	if (crypto_ahash_final(req))
   1350		goto clear_hash;
   1351
   1352	tcp_put_md5sig_pool();
   1353	return 0;
   1354
   1355clear_hash:
   1356	tcp_put_md5sig_pool();
   1357clear_hash_noput:
   1358	memset(md5_hash, 0, 16);
   1359	return 1;
   1360}
   1361
   1362int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
   1363			const struct sock *sk,
   1364			const struct sk_buff *skb)
   1365{
   1366	struct tcp_md5sig_pool *hp;
   1367	struct ahash_request *req;
   1368	const struct tcphdr *th = tcp_hdr(skb);
   1369	__be32 saddr, daddr;
   1370
   1371	if (sk) { /* valid for establish/request sockets */
   1372		saddr = sk->sk_rcv_saddr;
   1373		daddr = sk->sk_daddr;
   1374	} else {
   1375		const struct iphdr *iph = ip_hdr(skb);
   1376		saddr = iph->saddr;
   1377		daddr = iph->daddr;
   1378	}
   1379
   1380	hp = tcp_get_md5sig_pool();
   1381	if (!hp)
   1382		goto clear_hash_noput;
   1383	req = hp->md5_req;
   1384
   1385	if (crypto_ahash_init(req))
   1386		goto clear_hash;
   1387
   1388	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
   1389		goto clear_hash;
   1390	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
   1391		goto clear_hash;
   1392	if (tcp_md5_hash_key(hp, key))
   1393		goto clear_hash;
   1394	ahash_request_set_crypt(req, NULL, md5_hash, 0);
   1395	if (crypto_ahash_final(req))
   1396		goto clear_hash;
   1397
   1398	tcp_put_md5sig_pool();
   1399	return 0;
   1400
   1401clear_hash:
   1402	tcp_put_md5sig_pool();
   1403clear_hash_noput:
   1404	memset(md5_hash, 0, 16);
   1405	return 1;
   1406}
   1407EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
   1408
   1409#endif
   1410
   1411static void tcp_v4_init_req(struct request_sock *req,
   1412			    const struct sock *sk_listener,
   1413			    struct sk_buff *skb)
   1414{
   1415	struct inet_request_sock *ireq = inet_rsk(req);
   1416	struct net *net = sock_net(sk_listener);
   1417
   1418	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
   1419	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
   1420	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
   1421}
   1422
   1423static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
   1424					  struct sk_buff *skb,
   1425					  struct flowi *fl,
   1426					  struct request_sock *req)
   1427{
   1428	tcp_v4_init_req(req, sk, skb);
   1429
   1430	if (security_inet_conn_request(sk, skb, req))
   1431		return NULL;
   1432
   1433	return inet_csk_route_req(sk, &fl->u.ip4, req);
   1434}
   1435
   1436struct request_sock_ops tcp_request_sock_ops __read_mostly = {
   1437	.family		=	PF_INET,
   1438	.obj_size	=	sizeof(struct tcp_request_sock),
   1439	.rtx_syn_ack	=	tcp_rtx_synack,
   1440	.send_ack	=	tcp_v4_reqsk_send_ack,
   1441	.destructor	=	tcp_v4_reqsk_destructor,
   1442	.send_reset	=	tcp_v4_send_reset,
   1443	.syn_ack_timeout =	tcp_syn_ack_timeout,
   1444};
   1445
   1446const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
   1447	.mss_clamp	=	TCP_MSS_DEFAULT,
   1448#ifdef CONFIG_TCP_MD5SIG
   1449	.req_md5_lookup	=	tcp_v4_md5_lookup,
   1450	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
   1451#endif
   1452#ifdef CONFIG_SYN_COOKIES
   1453	.cookie_init_seq =	cookie_v4_init_sequence,
   1454#endif
   1455	.route_req	=	tcp_v4_route_req,
   1456	.init_seq	=	tcp_v4_init_seq,
   1457	.init_ts_off	=	tcp_v4_init_ts_off,
   1458	.send_synack	=	tcp_v4_send_synack,
   1459};
   1460
   1461int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
   1462{
    1463	/* Never answer SYNs sent to broadcast or multicast */
   1464	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
   1465		goto drop;
   1466
   1467	return tcp_conn_request(&tcp_request_sock_ops,
   1468				&tcp_request_sock_ipv4_ops, sk, skb);
   1469
   1470drop:
   1471	tcp_listendrop(sk);
   1472	return 0;
   1473}
   1474EXPORT_SYMBOL(tcp_v4_conn_request);
   1475
   1476
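/* A minimal userspace sketch (not kernel code): a listening AF_INET socket is
 * what makes incoming SYNs arrive in tcp_v4_conn_request(); the socket
 * returned by accept() is the one created later by tcp_v4_syn_recv_sock().
 * Port 8080 and the backlog of 128 are arbitrary placeholders.
 */
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_addr.s_addr = htonl(INADDR_ANY),
				    .sin_port = htons(8080) };
	int fd = socket(AF_INET, SOCK_STREAM, 0), cfd;

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0)	/* backlog bounds sk_acceptq_is_full() */
		return 1;

	cfd = accept(fd, NULL, NULL);
	if (cfd >= 0)
		close(cfd);
	close(fd);
	return 0;
}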
   1477/*
    1478 * The three-way handshake has completed - we got a valid synack -
   1479 * now create the new socket.
   1480 */
   1481struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
   1482				  struct request_sock *req,
   1483				  struct dst_entry *dst,
   1484				  struct request_sock *req_unhash,
   1485				  bool *own_req)
   1486{
   1487	struct inet_request_sock *ireq;
   1488	bool found_dup_sk = false;
   1489	struct inet_sock *newinet;
   1490	struct tcp_sock *newtp;
   1491	struct sock *newsk;
   1492#ifdef CONFIG_TCP_MD5SIG
   1493	const union tcp_md5_addr *addr;
   1494	struct tcp_md5sig_key *key;
   1495	int l3index;
   1496#endif
   1497	struct ip_options_rcu *inet_opt;
   1498
   1499	if (sk_acceptq_is_full(sk))
   1500		goto exit_overflow;
   1501
   1502	newsk = tcp_create_openreq_child(sk, req, skb);
   1503	if (!newsk)
   1504		goto exit_nonewsk;
   1505
   1506	newsk->sk_gso_type = SKB_GSO_TCPV4;
   1507	inet_sk_rx_dst_set(newsk, skb);
   1508
   1509	newtp		      = tcp_sk(newsk);
   1510	newinet		      = inet_sk(newsk);
   1511	ireq		      = inet_rsk(req);
   1512	sk_daddr_set(newsk, ireq->ir_rmt_addr);
   1513	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
   1514	newsk->sk_bound_dev_if = ireq->ir_iif;
   1515	newinet->inet_saddr   = ireq->ir_loc_addr;
   1516	inet_opt	      = rcu_dereference(ireq->ireq_opt);
   1517	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
   1518	newinet->mc_index     = inet_iif(skb);
   1519	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
   1520	newinet->rcv_tos      = ip_hdr(skb)->tos;
   1521	inet_csk(newsk)->icsk_ext_hdr_len = 0;
   1522	if (inet_opt)
   1523		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
   1524	newinet->inet_id = prandom_u32();
   1525
   1526	/* Set ToS of the new socket based upon the value of incoming SYN.
   1527	 * ECT bits are set later in tcp_init_transfer().
   1528	 */
   1529	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
   1530		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
   1531
   1532	if (!dst) {
   1533		dst = inet_csk_route_child_sock(sk, newsk, req);
   1534		if (!dst)
   1535			goto put_and_exit;
   1536	} else {
   1537		/* syncookie case : see end of cookie_v4_check() */
   1538	}
   1539	sk_setup_caps(newsk, dst);
   1540
   1541	tcp_ca_openreq_child(newsk, dst);
   1542
   1543	tcp_sync_mss(newsk, dst_mtu(dst));
   1544	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
   1545
   1546	tcp_initialize_rcv_mss(newsk);
   1547
   1548#ifdef CONFIG_TCP_MD5SIG
   1549	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
   1550	/* Copy over the MD5 key from the original socket */
   1551	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
   1552	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
   1553	if (key) {
   1554		/*
   1555		 * We're using one, so create a matching key
   1556		 * on the newsk structure. If we fail to get
   1557		 * memory, then we end up not copying the key
   1558		 * across. Shucks.
   1559		 */
   1560		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
   1561			       key->key, key->keylen, GFP_ATOMIC);
   1562		sk_gso_disable(newsk);
   1563	}
   1564#endif
   1565
   1566	if (__inet_inherit_port(sk, newsk) < 0)
   1567		goto put_and_exit;
   1568	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
   1569				       &found_dup_sk);
   1570	if (likely(*own_req)) {
   1571		tcp_move_syn(newtp, req);
   1572		ireq->ireq_opt = NULL;
   1573	} else {
   1574		newinet->inet_opt = NULL;
   1575
   1576		if (!req_unhash && found_dup_sk) {
   1577			/* This code path should only be executed in the
    1578			 * syncookie case
   1579			 */
   1580			bh_unlock_sock(newsk);
   1581			sock_put(newsk);
   1582			newsk = NULL;
   1583		}
   1584	}
   1585	return newsk;
   1586
   1587exit_overflow:
   1588	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
   1589exit_nonewsk:
   1590	dst_release(dst);
   1591exit:
   1592	tcp_listendrop(sk);
   1593	return NULL;
   1594put_and_exit:
   1595	newinet->inet_opt = NULL;
   1596	inet_csk_prepare_forced_close(newsk);
   1597	tcp_done(newsk);
   1598	goto exit;
   1599}
   1600EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
   1601
   1602static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
   1603{
   1604#ifdef CONFIG_SYN_COOKIES
   1605	const struct tcphdr *th = tcp_hdr(skb);
   1606
   1607	if (!th->syn)
   1608		sk = cookie_v4_check(sk, skb);
   1609#endif
   1610	return sk;
   1611}
   1612
   1613u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
   1614			 struct tcphdr *th, u32 *cookie)
   1615{
   1616	u16 mss = 0;
   1617#ifdef CONFIG_SYN_COOKIES
   1618	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
   1619				    &tcp_request_sock_ipv4_ops, sk, th);
   1620	if (mss) {
   1621		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
   1622		tcp_synq_overflow(sk);
   1623	}
   1624#endif
   1625	return mss;
   1626}
   1627
   1628INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
   1629							   u32));
    1630/* The socket must have its spinlock held when we get
   1631 * here, unless it is a TCP_LISTEN socket.
   1632 *
   1633 * We have a potential double-lock case here, so even when
   1634 * doing backlog processing we use the BH locking scheme.
   1635 * This is because we cannot sleep with the original spinlock
   1636 * held.
   1637 */
   1638int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
   1639{
   1640	enum skb_drop_reason reason;
   1641	struct sock *rsk;
   1642
   1643	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
   1644		struct dst_entry *dst;
   1645
   1646		dst = rcu_dereference_protected(sk->sk_rx_dst,
   1647						lockdep_sock_is_held(sk));
   1648
   1649		sock_rps_save_rxhash(sk, skb);
   1650		sk_mark_napi_id(sk, skb);
   1651		if (dst) {
   1652			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
   1653			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
   1654					     dst, 0)) {
   1655				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
   1656				dst_release(dst);
   1657			}
   1658		}
   1659		tcp_rcv_established(sk, skb);
   1660		return 0;
   1661	}
   1662
   1663	reason = SKB_DROP_REASON_NOT_SPECIFIED;
   1664	if (tcp_checksum_complete(skb))
   1665		goto csum_err;
   1666
   1667	if (sk->sk_state == TCP_LISTEN) {
   1668		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
   1669
   1670		if (!nsk)
   1671			goto discard;
   1672		if (nsk != sk) {
   1673			if (tcp_child_process(sk, nsk, skb)) {
   1674				rsk = nsk;
   1675				goto reset;
   1676			}
   1677			return 0;
   1678		}
   1679	} else
   1680		sock_rps_save_rxhash(sk, skb);
   1681
   1682	if (tcp_rcv_state_process(sk, skb)) {
   1683		rsk = sk;
   1684		goto reset;
   1685	}
   1686	return 0;
   1687
   1688reset:
   1689	tcp_v4_send_reset(rsk, skb);
   1690discard:
   1691	kfree_skb_reason(skb, reason);
   1692	/* Be careful here. If this function gets more complicated and
   1693	 * gcc suffers from register pressure on the x86, sk (in %ebx)
   1694	 * might be destroyed here. This current version compiles correctly,
   1695	 * but you have been warned.
   1696	 */
   1697	return 0;
   1698
   1699csum_err:
   1700	reason = SKB_DROP_REASON_TCP_CSUM;
   1701	trace_tcp_bad_csum(skb);
   1702	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
   1703	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
   1704	goto discard;
   1705}
   1706EXPORT_SYMBOL(tcp_v4_do_rcv);
   1707
   1708int tcp_v4_early_demux(struct sk_buff *skb)
   1709{
   1710	const struct iphdr *iph;
   1711	const struct tcphdr *th;
   1712	struct sock *sk;
   1713
   1714	if (skb->pkt_type != PACKET_HOST)
   1715		return 0;
   1716
   1717	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
   1718		return 0;
   1719
   1720	iph = ip_hdr(skb);
   1721	th = tcp_hdr(skb);
   1722
   1723	if (th->doff < sizeof(struct tcphdr) / 4)
   1724		return 0;
   1725
   1726	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
   1727				       iph->saddr, th->source,
   1728				       iph->daddr, ntohs(th->dest),
   1729				       skb->skb_iif, inet_sdif(skb));
   1730	if (sk) {
   1731		skb->sk = sk;
   1732		skb->destructor = sock_edemux;
   1733		if (sk_fullsock(sk)) {
   1734			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
   1735
   1736			if (dst)
   1737				dst = dst_check(dst, 0);
   1738			if (dst &&
   1739			    sk->sk_rx_dst_ifindex == skb->skb_iif)
   1740				skb_dst_set_noref(skb, dst);
   1741		}
   1742	}
   1743	return 0;
   1744}
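
/*
 * Illustrative sketch (not part of this file): the early-demux fast path
 * above can be turned off system-wide through the net.ipv4.tcp_early_demux
 * sysctl.  A minimal userspace toggle, assuming procfs is mounted at /proc:
 */
#if 0	/* example only */
#include <stdio.h>

static int set_tcp_early_demux(int on)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_early_demux", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", on ? 1 : 0);
	return fclose(f);
}
#endif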
   1745
   1746bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
   1747		     enum skb_drop_reason *reason)
   1748{
   1749	u32 limit, tail_gso_size, tail_gso_segs;
   1750	struct skb_shared_info *shinfo;
   1751	const struct tcphdr *th;
   1752	struct tcphdr *thtail;
   1753	struct sk_buff *tail;
   1754	unsigned int hdrlen;
   1755	bool fragstolen;
   1756	u32 gso_segs;
   1757	u32 gso_size;
   1758	int delta;
   1759
   1760	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
   1761	 * we can fix skb->truesize to its real value to avoid future drops.
   1762	 * This is valid because skb is not yet charged to the socket.
    1763	 * It has been noticed that pure SACK packets were sometimes dropped
    1764	 * (if cooked by drivers without the copybreak feature).
   1765	 */
   1766	skb_condense(skb);
   1767
   1768	skb_dst_drop(skb);
   1769
   1770	if (unlikely(tcp_checksum_complete(skb))) {
   1771		bh_unlock_sock(sk);
   1772		trace_tcp_bad_csum(skb);
   1773		*reason = SKB_DROP_REASON_TCP_CSUM;
   1774		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
   1775		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
   1776		return true;
   1777	}
   1778
   1779	/* Attempt coalescing to last skb in backlog, even if we are
   1780	 * above the limits.
   1781	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
   1782	 */
   1783	th = (const struct tcphdr *)skb->data;
   1784	hdrlen = th->doff * 4;
   1785
   1786	tail = sk->sk_backlog.tail;
   1787	if (!tail)
   1788		goto no_coalesce;
   1789	thtail = (struct tcphdr *)tail->data;
   1790
   1791	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
   1792	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
   1793	    ((TCP_SKB_CB(tail)->tcp_flags |
   1794	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
   1795	    !((TCP_SKB_CB(tail)->tcp_flags &
   1796	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
   1797	    ((TCP_SKB_CB(tail)->tcp_flags ^
   1798	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
   1799#ifdef CONFIG_TLS_DEVICE
   1800	    tail->decrypted != skb->decrypted ||
   1801#endif
   1802	    thtail->doff != th->doff ||
   1803	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
   1804		goto no_coalesce;
   1805
   1806	__skb_pull(skb, hdrlen);
   1807
   1808	shinfo = skb_shinfo(skb);
   1809	gso_size = shinfo->gso_size ?: skb->len;
   1810	gso_segs = shinfo->gso_segs ?: 1;
   1811
   1812	shinfo = skb_shinfo(tail);
   1813	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
   1814	tail_gso_segs = shinfo->gso_segs ?: 1;
   1815
   1816	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
   1817		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
   1818
   1819		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
   1820			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
   1821			thtail->window = th->window;
   1822		}
   1823
   1824		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
   1825		 * thtail->fin, so that the fast path in tcp_rcv_established()
   1826		 * is not entered if we append a packet with a FIN.
   1827		 * SYN, RST, URG are not present.
   1828		 * ACK is set on both packets.
    1829		 * PSH : the TCP stack does not really care,
    1830		 *       at least for 'GRO' packets.
   1831		 */
   1832		thtail->fin |= th->fin;
   1833		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
   1834
   1835		if (TCP_SKB_CB(skb)->has_rxtstamp) {
   1836			TCP_SKB_CB(tail)->has_rxtstamp = true;
   1837			tail->tstamp = skb->tstamp;
   1838			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
   1839		}
   1840
    1841		/* Not as strict as GRO. We only need to carry the max mss value */
   1842		shinfo->gso_size = max(gso_size, tail_gso_size);
   1843		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
   1844
   1845		sk->sk_backlog.len += delta;
   1846		__NET_INC_STATS(sock_net(sk),
   1847				LINUX_MIB_TCPBACKLOGCOALESCE);
   1848		kfree_skb_partial(skb, fragstolen);
   1849		return false;
   1850	}
   1851	__skb_push(skb, hdrlen);
   1852
   1853no_coalesce:
    1854	/* Only the socket owner can try to collapse/prune rx queues
    1855	 * to reduce memory overhead, so add a little headroom here.
    1856	 * Only a few socket backlogs can be non-empty at the same time.
    1857	 */
   1858	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
   1859
   1860	if (unlikely(sk_add_backlog(sk, skb, limit))) {
   1861		bh_unlock_sock(sk);
   1862		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
   1863		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
   1864		return true;
   1865	}
   1866	return false;
   1867}
   1868EXPORT_SYMBOL(tcp_add_backlog);
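
/*
 * Illustrative sketch (not part of this file): the backlog limit above is
 * sk_rcvbuf + sk_sndbuf + 64 KB, so enlarging the receive buffer from
 * userspace also gives the backlog more headroom.  The kernel doubles the
 * requested SO_RCVBUF value, capped by net.core.rmem_max.
 */
#if 0	/* example only */
#include <sys/socket.h>

static int grow_rcvbuf(int fd, int bytes)
{
	return setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes));
}
#endif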
   1869
   1870int tcp_filter(struct sock *sk, struct sk_buff *skb)
   1871{
   1872	struct tcphdr *th = (struct tcphdr *)skb->data;
   1873
   1874	return sk_filter_trim_cap(sk, skb, th->doff * 4);
   1875}
   1876EXPORT_SYMBOL(tcp_filter);
   1877
   1878static void tcp_v4_restore_cb(struct sk_buff *skb)
   1879{
   1880	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
   1881		sizeof(struct inet_skb_parm));
   1882}
   1883
   1884static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
   1885			   const struct tcphdr *th)
   1886{
    1887	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
    1888	 * barrier() makes sure the compiler won't play aliasing games.
    1889	 */
   1890	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
   1891		sizeof(struct inet_skb_parm));
   1892	barrier();
   1893
   1894	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
   1895	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
   1896				    skb->len - th->doff * 4);
   1897	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
   1898	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
   1899	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
   1900	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
   1901	TCP_SKB_CB(skb)->sacked	 = 0;
   1902	TCP_SKB_CB(skb)->has_rxtstamp =
   1903			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
   1904}
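
/*
 * Worked example of the end_seq arithmetic above (not part of this file):
 * SYN and FIN each consume one unit of sequence space in addition to the
 * payload.  For a segment with seq = 1000 carrying 500 bytes of data and the
 * FIN flag set (no SYN):
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + (520 - 20) = 1501
 *
 * where 520 is skb->len and 20 is th->doff * 4 for a header with no options.
 */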
   1905
   1906/*
   1907 *	From tcp_input.c
   1908 */
   1909
   1910int tcp_v4_rcv(struct sk_buff *skb)
   1911{
   1912	struct net *net = dev_net(skb->dev);
   1913	enum skb_drop_reason drop_reason;
   1914	int sdif = inet_sdif(skb);
   1915	int dif = inet_iif(skb);
   1916	const struct iphdr *iph;
   1917	const struct tcphdr *th;
   1918	bool refcounted;
   1919	struct sock *sk;
   1920	int ret;
   1921
   1922	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
   1923	if (skb->pkt_type != PACKET_HOST)
   1924		goto discard_it;
   1925
   1926	/* Count it even if it's bad */
   1927	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
   1928
   1929	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
   1930		goto discard_it;
   1931
   1932	th = (const struct tcphdr *)skb->data;
   1933
   1934	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
   1935		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
   1936		goto bad_packet;
   1937	}
   1938	if (!pskb_may_pull(skb, th->doff * 4))
   1939		goto discard_it;
   1940
   1941	/* An explanation is required here, I think.
   1942	 * Packet length and doff are validated by header prediction,
    1943	 * provided the case of th->doff == 0 is eliminated.
   1944	 * So, we defer the checks. */
   1945
   1946	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
   1947		goto csum_error;
   1948
   1949	th = (const struct tcphdr *)skb->data;
   1950	iph = ip_hdr(skb);
   1951lookup:
   1952	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
   1953			       th->dest, sdif, &refcounted);
   1954	if (!sk)
   1955		goto no_tcp_socket;
   1956
   1957process:
   1958	if (sk->sk_state == TCP_TIME_WAIT)
   1959		goto do_time_wait;
   1960
   1961	if (sk->sk_state == TCP_NEW_SYN_RECV) {
   1962		struct request_sock *req = inet_reqsk(sk);
   1963		bool req_stolen = false;
   1964		struct sock *nsk;
   1965
   1966		sk = req->rsk_listener;
   1967		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
   1968			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
   1969		else
   1970			drop_reason = tcp_inbound_md5_hash(sk, skb,
   1971						   &iph->saddr, &iph->daddr,
   1972						   AF_INET, dif, sdif);
   1973		if (unlikely(drop_reason)) {
   1974			sk_drops_add(sk, skb);
   1975			reqsk_put(req);
   1976			goto discard_it;
   1977		}
   1978		if (tcp_checksum_complete(skb)) {
   1979			reqsk_put(req);
   1980			goto csum_error;
   1981		}
   1982		if (unlikely(sk->sk_state != TCP_LISTEN)) {
   1983			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
   1984			if (!nsk) {
   1985				inet_csk_reqsk_queue_drop_and_put(sk, req);
   1986				goto lookup;
   1987			}
   1988			sk = nsk;
    1989		/* reuseport_migrate_sock() has already taken one sk_refcnt
    1990		 * reference before returning.
   1991			 */
   1992		} else {
   1993			/* We own a reference on the listener, increase it again
   1994			 * as we might lose it too soon.
   1995			 */
   1996			sock_hold(sk);
   1997		}
   1998		refcounted = true;
   1999		nsk = NULL;
   2000		if (!tcp_filter(sk, skb)) {
   2001			th = (const struct tcphdr *)skb->data;
   2002			iph = ip_hdr(skb);
   2003			tcp_v4_fill_cb(skb, iph, th);
   2004			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
   2005		} else {
   2006			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
   2007		}
   2008		if (!nsk) {
   2009			reqsk_put(req);
   2010			if (req_stolen) {
   2011				/* Another cpu got exclusive access to req
    2012			 * and created a full-blown socket.
   2013				 * Try to feed this packet to this socket
   2014				 * instead of discarding it.
   2015				 */
   2016				tcp_v4_restore_cb(skb);
   2017				sock_put(sk);
   2018				goto lookup;
   2019			}
   2020			goto discard_and_relse;
   2021		}
   2022		nf_reset_ct(skb);
   2023		if (nsk == sk) {
   2024			reqsk_put(req);
   2025			tcp_v4_restore_cb(skb);
   2026		} else if (tcp_child_process(sk, nsk, skb)) {
   2027			tcp_v4_send_reset(nsk, skb);
   2028			goto discard_and_relse;
   2029		} else {
   2030			sock_put(sk);
   2031			return 0;
   2032		}
   2033	}
   2034
   2035	if (static_branch_unlikely(&ip4_min_ttl)) {
   2036		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
   2037		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
   2038			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
   2039			goto discard_and_relse;
   2040		}
   2041	}
   2042
   2043	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
   2044		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
   2045		goto discard_and_relse;
   2046	}
   2047
   2048	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
   2049					   &iph->daddr, AF_INET, dif, sdif);
   2050	if (drop_reason)
   2051		goto discard_and_relse;
   2052
   2053	nf_reset_ct(skb);
   2054
   2055	if (tcp_filter(sk, skb)) {
   2056		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
   2057		goto discard_and_relse;
   2058	}
   2059	th = (const struct tcphdr *)skb->data;
   2060	iph = ip_hdr(skb);
   2061	tcp_v4_fill_cb(skb, iph, th);
   2062
   2063	skb->dev = NULL;
   2064
   2065	if (sk->sk_state == TCP_LISTEN) {
   2066		ret = tcp_v4_do_rcv(sk, skb);
   2067		goto put_and_return;
   2068	}
   2069
   2070	sk_incoming_cpu_update(sk);
   2071
   2072	bh_lock_sock_nested(sk);
   2073	tcp_segs_in(tcp_sk(sk), skb);
   2074	ret = 0;
   2075	if (!sock_owned_by_user(sk)) {
   2076		ret = tcp_v4_do_rcv(sk, skb);
   2077	} else {
   2078		if (tcp_add_backlog(sk, skb, &drop_reason))
   2079			goto discard_and_relse;
   2080	}
   2081	bh_unlock_sock(sk);
   2082
   2083put_and_return:
   2084	if (refcounted)
   2085		sock_put(sk);
   2086
   2087	return ret;
   2088
   2089no_tcp_socket:
   2090	drop_reason = SKB_DROP_REASON_NO_SOCKET;
   2091	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
   2092		goto discard_it;
   2093
   2094	tcp_v4_fill_cb(skb, iph, th);
   2095
   2096	if (tcp_checksum_complete(skb)) {
   2097csum_error:
   2098		drop_reason = SKB_DROP_REASON_TCP_CSUM;
   2099		trace_tcp_bad_csum(skb);
   2100		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
   2101bad_packet:
   2102		__TCP_INC_STATS(net, TCP_MIB_INERRS);
   2103	} else {
   2104		tcp_v4_send_reset(NULL, skb);
   2105	}
   2106
   2107discard_it:
   2108	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
   2109	/* Discard frame. */
   2110	kfree_skb_reason(skb, drop_reason);
   2111	return 0;
   2112
   2113discard_and_relse:
   2114	sk_drops_add(sk, skb);
   2115	if (refcounted)
   2116		sock_put(sk);
   2117	goto discard_it;
   2118
   2119do_time_wait:
   2120	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
   2121		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
   2122		inet_twsk_put(inet_twsk(sk));
   2123		goto discard_it;
   2124	}
   2125
   2126	tcp_v4_fill_cb(skb, iph, th);
   2127
   2128	if (tcp_checksum_complete(skb)) {
   2129		inet_twsk_put(inet_twsk(sk));
   2130		goto csum_error;
   2131	}
   2132	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
   2133	case TCP_TW_SYN: {
   2134		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
   2135							&tcp_hashinfo, skb,
   2136							__tcp_hdrlen(th),
   2137							iph->saddr, th->source,
   2138							iph->daddr, th->dest,
   2139							inet_iif(skb),
   2140							sdif);
   2141		if (sk2) {
   2142			inet_twsk_deschedule_put(inet_twsk(sk));
   2143			sk = sk2;
   2144			tcp_v4_restore_cb(skb);
   2145			refcounted = false;
   2146			goto process;
   2147		}
   2148	}
   2149		/* to ACK */
   2150		fallthrough;
   2151	case TCP_TW_ACK:
   2152		tcp_v4_timewait_ack(sk, skb);
   2153		break;
   2154	case TCP_TW_RST:
   2155		tcp_v4_send_reset(sk, skb);
   2156		inet_twsk_deschedule_put(inet_twsk(sk));
   2157		goto discard_it;
   2158	case TCP_TW_SUCCESS:;
   2159	}
   2160	goto discard_it;
   2161}
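
/*
 * Illustrative sketch (not part of this file): the min_ttl check in
 * tcp_v4_rcv() above enforces the IP_MINTTL socket option (the GTSM-style
 * defence of RFC 5082).  Hypothetical userspace arming it on a listener:
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_IP, IP_MINTTL */

static int require_min_ttl(int fd, int min_ttl)
{
	/* min_ttl = 255 accepts packets only from directly connected peers */
	return setsockopt(fd, IPPROTO_IP, IP_MINTTL, &min_ttl, sizeof(min_ttl));
}
#endif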
   2162
   2163static struct timewait_sock_ops tcp_timewait_sock_ops = {
   2164	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
   2165	.twsk_unique	= tcp_twsk_unique,
   2166	.twsk_destructor= tcp_twsk_destructor,
   2167};
   2168
   2169void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
   2170{
   2171	struct dst_entry *dst = skb_dst(skb);
   2172
   2173	if (dst && dst_hold_safe(dst)) {
   2174		rcu_assign_pointer(sk->sk_rx_dst, dst);
   2175		sk->sk_rx_dst_ifindex = skb->skb_iif;
   2176	}
   2177}
   2178EXPORT_SYMBOL(inet_sk_rx_dst_set);
   2179
   2180const struct inet_connection_sock_af_ops ipv4_specific = {
   2181	.queue_xmit	   = ip_queue_xmit,
   2182	.send_check	   = tcp_v4_send_check,
   2183	.rebuild_header	   = inet_sk_rebuild_header,
   2184	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
   2185	.conn_request	   = tcp_v4_conn_request,
   2186	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
   2187	.net_header_len	   = sizeof(struct iphdr),
   2188	.setsockopt	   = ip_setsockopt,
   2189	.getsockopt	   = ip_getsockopt,
   2190	.addr2sockaddr	   = inet_csk_addr2sockaddr,
   2191	.sockaddr_len	   = sizeof(struct sockaddr_in),
   2192	.mtu_reduced	   = tcp_v4_mtu_reduced,
   2193};
   2194EXPORT_SYMBOL(ipv4_specific);
   2195
   2196#ifdef CONFIG_TCP_MD5SIG
   2197static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
   2198	.md5_lookup		= tcp_v4_md5_lookup,
   2199	.calc_md5_hash		= tcp_v4_md5_hash_skb,
   2200	.md5_parse		= tcp_v4_parse_md5_keys,
   2201};
   2202#endif
   2203
    2204/* NOTE: A lot of things are set to zero explicitly by the call to
    2205 *       sk_alloc(), so they need not be done here.
   2206 */
   2207static int tcp_v4_init_sock(struct sock *sk)
   2208{
   2209	struct inet_connection_sock *icsk = inet_csk(sk);
   2210
   2211	tcp_init_sock(sk);
   2212
   2213	icsk->icsk_af_ops = &ipv4_specific;
   2214
   2215#ifdef CONFIG_TCP_MD5SIG
   2216	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
   2217#endif
   2218
   2219	return 0;
   2220}
   2221
   2222void tcp_v4_destroy_sock(struct sock *sk)
   2223{
   2224	struct tcp_sock *tp = tcp_sk(sk);
   2225
   2226	trace_tcp_destroy_sock(sk);
   2227
   2228	tcp_clear_xmit_timers(sk);
   2229
   2230	tcp_cleanup_congestion_control(sk);
   2231
   2232	tcp_cleanup_ulp(sk);
   2233
    2234	/* Clean up the write buffer. */
   2235	tcp_write_queue_purge(sk);
   2236
   2237	/* Check if we want to disable active TFO */
   2238	tcp_fastopen_active_disable_ofo_check(sk);
   2239
   2240	/* Cleans up our, hopefully empty, out_of_order_queue. */
   2241	skb_rbtree_purge(&tp->out_of_order_queue);
   2242
   2243#ifdef CONFIG_TCP_MD5SIG
   2244	/* Clean up the MD5 key list, if any */
   2245	if (tp->md5sig_info) {
   2246		tcp_clear_md5_list(sk);
   2247		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
   2248		tp->md5sig_info = NULL;
   2249	}
   2250#endif
   2251
   2252	/* Clean up a referenced TCP bind bucket. */
   2253	if (inet_csk(sk)->icsk_bind_hash)
   2254		inet_put_port(sk);
   2255
   2256	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
   2257
   2258	/* If socket is aborted during connect operation */
   2259	tcp_free_fastopen_req(tp);
   2260	tcp_fastopen_destroy_cipher(sk);
   2261	tcp_saved_syn_free(tp);
   2262
   2263	sk_sockets_allocated_dec(sk);
   2264}
   2265EXPORT_SYMBOL(tcp_v4_destroy_sock);
   2266
   2267#ifdef CONFIG_PROC_FS
   2268/* Proc filesystem TCP sock list dumping. */
   2269
   2270static unsigned short seq_file_family(const struct seq_file *seq);
   2271
   2272static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
   2273{
   2274	unsigned short family = seq_file_family(seq);
   2275
   2276	/* AF_UNSPEC is used as a match all */
   2277	return ((family == AF_UNSPEC || family == sk->sk_family) &&
   2278		net_eq(sock_net(sk), seq_file_net(seq)));
   2279}
   2280
    2281/* Find a non-empty bucket (starting from st->bucket)
   2282 * and return the first sk from it.
   2283 */
   2284static void *listening_get_first(struct seq_file *seq)
   2285{
   2286	struct tcp_iter_state *st = seq->private;
   2287
   2288	st->offset = 0;
   2289	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
   2290		struct inet_listen_hashbucket *ilb2;
   2291		struct hlist_nulls_node *node;
   2292		struct sock *sk;
   2293
   2294		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
   2295		if (hlist_nulls_empty(&ilb2->nulls_head))
   2296			continue;
   2297
   2298		spin_lock(&ilb2->lock);
   2299		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
   2300			if (seq_sk_match(seq, sk))
   2301				return sk;
   2302		}
   2303		spin_unlock(&ilb2->lock);
   2304	}
   2305
   2306	return NULL;
   2307}
   2308
   2309/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
   2310 * If "cur" is the last one in the st->bucket,
   2311 * call listening_get_first() to return the first sk of the next
    2312 * non-empty bucket.
   2313 */
   2314static void *listening_get_next(struct seq_file *seq, void *cur)
   2315{
   2316	struct tcp_iter_state *st = seq->private;
   2317	struct inet_listen_hashbucket *ilb2;
   2318	struct hlist_nulls_node *node;
   2319	struct sock *sk = cur;
   2320
   2321	++st->num;
   2322	++st->offset;
   2323
   2324	sk = sk_nulls_next(sk);
   2325	sk_nulls_for_each_from(sk, node) {
   2326		if (seq_sk_match(seq, sk))
   2327			return sk;
   2328	}
   2329
   2330	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
   2331	spin_unlock(&ilb2->lock);
   2332	++st->bucket;
   2333	return listening_get_first(seq);
   2334}
   2335
   2336static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
   2337{
   2338	struct tcp_iter_state *st = seq->private;
   2339	void *rc;
   2340
   2341	st->bucket = 0;
   2342	st->offset = 0;
   2343	rc = listening_get_first(seq);
   2344
   2345	while (rc && *pos) {
   2346		rc = listening_get_next(seq, rc);
   2347		--*pos;
   2348	}
   2349	return rc;
   2350}
   2351
   2352static inline bool empty_bucket(const struct tcp_iter_state *st)
   2353{
   2354	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
   2355}
   2356
   2357/*
   2358 * Get first established socket starting from bucket given in st->bucket.
   2359 * If st->bucket is zero, the very first socket in the hash is returned.
   2360 */
   2361static void *established_get_first(struct seq_file *seq)
   2362{
   2363	struct tcp_iter_state *st = seq->private;
   2364
   2365	st->offset = 0;
   2366	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
   2367		struct sock *sk;
   2368		struct hlist_nulls_node *node;
   2369		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
   2370
   2371		/* Lockless fast path for the common case of empty buckets */
   2372		if (empty_bucket(st))
   2373			continue;
   2374
   2375		spin_lock_bh(lock);
   2376		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
   2377			if (seq_sk_match(seq, sk))
   2378				return sk;
   2379		}
   2380		spin_unlock_bh(lock);
   2381	}
   2382
   2383	return NULL;
   2384}
   2385
   2386static void *established_get_next(struct seq_file *seq, void *cur)
   2387{
   2388	struct sock *sk = cur;
   2389	struct hlist_nulls_node *node;
   2390	struct tcp_iter_state *st = seq->private;
   2391
   2392	++st->num;
   2393	++st->offset;
   2394
   2395	sk = sk_nulls_next(sk);
   2396
   2397	sk_nulls_for_each_from(sk, node) {
   2398		if (seq_sk_match(seq, sk))
   2399			return sk;
   2400	}
   2401
   2402	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
   2403	++st->bucket;
   2404	return established_get_first(seq);
   2405}
   2406
   2407static void *established_get_idx(struct seq_file *seq, loff_t pos)
   2408{
   2409	struct tcp_iter_state *st = seq->private;
   2410	void *rc;
   2411
   2412	st->bucket = 0;
   2413	rc = established_get_first(seq);
   2414
   2415	while (rc && pos) {
   2416		rc = established_get_next(seq, rc);
   2417		--pos;
   2418	}
   2419	return rc;
   2420}
   2421
   2422static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
   2423{
   2424	void *rc;
   2425	struct tcp_iter_state *st = seq->private;
   2426
   2427	st->state = TCP_SEQ_STATE_LISTENING;
   2428	rc	  = listening_get_idx(seq, &pos);
   2429
   2430	if (!rc) {
   2431		st->state = TCP_SEQ_STATE_ESTABLISHED;
   2432		rc	  = established_get_idx(seq, pos);
   2433	}
   2434
   2435	return rc;
   2436}
   2437
   2438static void *tcp_seek_last_pos(struct seq_file *seq)
   2439{
   2440	struct tcp_iter_state *st = seq->private;
   2441	int bucket = st->bucket;
   2442	int offset = st->offset;
   2443	int orig_num = st->num;
   2444	void *rc = NULL;
   2445
   2446	switch (st->state) {
   2447	case TCP_SEQ_STATE_LISTENING:
   2448		if (st->bucket > tcp_hashinfo.lhash2_mask)
   2449			break;
   2450		st->state = TCP_SEQ_STATE_LISTENING;
   2451		rc = listening_get_first(seq);
   2452		while (offset-- && rc && bucket == st->bucket)
   2453			rc = listening_get_next(seq, rc);
   2454		if (rc)
   2455			break;
   2456		st->bucket = 0;
   2457		st->state = TCP_SEQ_STATE_ESTABLISHED;
   2458		fallthrough;
   2459	case TCP_SEQ_STATE_ESTABLISHED:
   2460		if (st->bucket > tcp_hashinfo.ehash_mask)
   2461			break;
   2462		rc = established_get_first(seq);
   2463		while (offset-- && rc && bucket == st->bucket)
   2464			rc = established_get_next(seq, rc);
   2465	}
   2466
   2467	st->num = orig_num;
   2468
   2469	return rc;
   2470}
   2471
   2472void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
   2473{
   2474	struct tcp_iter_state *st = seq->private;
   2475	void *rc;
   2476
   2477	if (*pos && *pos == st->last_pos) {
   2478		rc = tcp_seek_last_pos(seq);
   2479		if (rc)
   2480			goto out;
   2481	}
   2482
   2483	st->state = TCP_SEQ_STATE_LISTENING;
   2484	st->num = 0;
   2485	st->bucket = 0;
   2486	st->offset = 0;
   2487	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
   2488
   2489out:
   2490	st->last_pos = *pos;
   2491	return rc;
   2492}
   2493EXPORT_SYMBOL(tcp_seq_start);
   2494
   2495void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   2496{
   2497	struct tcp_iter_state *st = seq->private;
   2498	void *rc = NULL;
   2499
   2500	if (v == SEQ_START_TOKEN) {
   2501		rc = tcp_get_idx(seq, 0);
   2502		goto out;
   2503	}
   2504
   2505	switch (st->state) {
   2506	case TCP_SEQ_STATE_LISTENING:
   2507		rc = listening_get_next(seq, v);
   2508		if (!rc) {
   2509			st->state = TCP_SEQ_STATE_ESTABLISHED;
   2510			st->bucket = 0;
   2511			st->offset = 0;
   2512			rc	  = established_get_first(seq);
   2513		}
   2514		break;
   2515	case TCP_SEQ_STATE_ESTABLISHED:
   2516		rc = established_get_next(seq, v);
   2517		break;
   2518	}
   2519out:
   2520	++*pos;
   2521	st->last_pos = *pos;
   2522	return rc;
   2523}
   2524EXPORT_SYMBOL(tcp_seq_next);
   2525
   2526void tcp_seq_stop(struct seq_file *seq, void *v)
   2527{
   2528	struct tcp_iter_state *st = seq->private;
   2529
   2530	switch (st->state) {
   2531	case TCP_SEQ_STATE_LISTENING:
   2532		if (v != SEQ_START_TOKEN)
   2533			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
   2534		break;
   2535	case TCP_SEQ_STATE_ESTABLISHED:
   2536		if (v)
   2537			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
   2538		break;
   2539	}
   2540}
   2541EXPORT_SYMBOL(tcp_seq_stop);
   2542
   2543static void get_openreq4(const struct request_sock *req,
   2544			 struct seq_file *f, int i)
   2545{
   2546	const struct inet_request_sock *ireq = inet_rsk(req);
   2547	long delta = req->rsk_timer.expires - jiffies;
   2548
   2549	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
   2550		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
   2551		i,
   2552		ireq->ir_loc_addr,
   2553		ireq->ir_num,
   2554		ireq->ir_rmt_addr,
   2555		ntohs(ireq->ir_rmt_port),
   2556		TCP_SYN_RECV,
   2557		0, 0, /* could print option size, but that is af dependent. */
   2558		1,    /* timers active (only the expire timer) */
   2559		jiffies_delta_to_clock_t(delta),
   2560		req->num_timeout,
   2561		from_kuid_munged(seq_user_ns(f),
   2562				 sock_i_uid(req->rsk_listener)),
   2563		0,  /* non standard timer */
   2564		0, /* open_requests have no inode */
   2565		0,
   2566		req);
   2567}
   2568
   2569static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
   2570{
   2571	int timer_active;
   2572	unsigned long timer_expires;
   2573	const struct tcp_sock *tp = tcp_sk(sk);
   2574	const struct inet_connection_sock *icsk = inet_csk(sk);
   2575	const struct inet_sock *inet = inet_sk(sk);
   2576	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
   2577	__be32 dest = inet->inet_daddr;
   2578	__be32 src = inet->inet_rcv_saddr;
   2579	__u16 destp = ntohs(inet->inet_dport);
   2580	__u16 srcp = ntohs(inet->inet_sport);
   2581	int rx_queue;
   2582	int state;
   2583
   2584	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
   2585	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
   2586	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
   2587		timer_active	= 1;
   2588		timer_expires	= icsk->icsk_timeout;
   2589	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
   2590		timer_active	= 4;
   2591		timer_expires	= icsk->icsk_timeout;
   2592	} else if (timer_pending(&sk->sk_timer)) {
   2593		timer_active	= 2;
   2594		timer_expires	= sk->sk_timer.expires;
   2595	} else {
   2596		timer_active	= 0;
   2597		timer_expires = jiffies;
   2598	}
   2599
   2600	state = inet_sk_state_load(sk);
   2601	if (state == TCP_LISTEN)
   2602		rx_queue = READ_ONCE(sk->sk_ack_backlog);
   2603	else
   2604		/* Because we don't lock the socket,
   2605		 * we might find a transient negative value.
   2606		 */
   2607		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
   2608				      READ_ONCE(tp->copied_seq), 0);
   2609
   2610	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
   2611			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
   2612		i, src, srcp, dest, destp, state,
   2613		READ_ONCE(tp->write_seq) - tp->snd_una,
   2614		rx_queue,
   2615		timer_active,
   2616		jiffies_delta_to_clock_t(timer_expires - jiffies),
   2617		icsk->icsk_retransmits,
   2618		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
   2619		icsk->icsk_probes_out,
   2620		sock_i_ino(sk),
   2621		refcount_read(&sk->sk_refcnt), sk,
   2622		jiffies_to_clock_t(icsk->icsk_rto),
   2623		jiffies_to_clock_t(icsk->icsk_ack.ato),
   2624		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
   2625		tcp_snd_cwnd(tp),
   2626		state == TCP_LISTEN ?
   2627		    fastopenq->max_qlen :
   2628		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
   2629}
   2630
   2631static void get_timewait4_sock(const struct inet_timewait_sock *tw,
   2632			       struct seq_file *f, int i)
   2633{
   2634	long delta = tw->tw_timer.expires - jiffies;
   2635	__be32 dest, src;
   2636	__u16 destp, srcp;
   2637
   2638	dest  = tw->tw_daddr;
   2639	src   = tw->tw_rcv_saddr;
   2640	destp = ntohs(tw->tw_dport);
   2641	srcp  = ntohs(tw->tw_sport);
   2642
   2643	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
   2644		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
   2645		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
   2646		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
   2647		refcount_read(&tw->tw_refcnt), tw);
   2648}
   2649
   2650#define TMPSZ 150
   2651
   2652static int tcp4_seq_show(struct seq_file *seq, void *v)
   2653{
   2654	struct tcp_iter_state *st;
   2655	struct sock *sk = v;
   2656
   2657	seq_setwidth(seq, TMPSZ - 1);
   2658	if (v == SEQ_START_TOKEN) {
   2659		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
   2660			   "rx_queue tr tm->when retrnsmt   uid  timeout "
   2661			   "inode");
   2662		goto out;
   2663	}
   2664	st = seq->private;
   2665
   2666	if (sk->sk_state == TCP_TIME_WAIT)
   2667		get_timewait4_sock(v, seq, st->num);
   2668	else if (sk->sk_state == TCP_NEW_SYN_RECV)
   2669		get_openreq4(v, seq, st->num);
   2670	else
   2671		get_tcp4_sock(v, seq, st->num);
   2672out:
   2673	seq_pad(seq, '\n');
   2674	return 0;
   2675}
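
/*
 * Illustrative sketch (not part of this file) of how a reader consumes the
 * /proc/net/tcp format emitted above: addresses, ports and state are
 * hexadecimal, and the 32-bit address is printed as stored in the socket
 * (network byte order), so the decoding below assumes a little-endian host.
 */
#if 0	/* example only */
#include <stdio.h>

static void dump_tcp4_sockets(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header line */
		fclose(f);
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;

		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		printf("%u.%u.%u.%u:%u -> %u.%u.%u.%u:%u state %#x\n",
		       laddr & 0xff, (laddr >> 8) & 0xff,
		       (laddr >> 16) & 0xff, laddr >> 24, lport,
		       raddr & 0xff, (raddr >> 8) & 0xff,
		       (raddr >> 16) & 0xff, raddr >> 24, rport, state);
	}
	fclose(f);
}
#endif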
   2676
   2677#ifdef CONFIG_BPF_SYSCALL
   2678struct bpf_tcp_iter_state {
   2679	struct tcp_iter_state state;
   2680	unsigned int cur_sk;
   2681	unsigned int end_sk;
   2682	unsigned int max_sk;
   2683	struct sock **batch;
   2684	bool st_bucket_done;
   2685};
   2686
   2687struct bpf_iter__tcp {
   2688	__bpf_md_ptr(struct bpf_iter_meta *, meta);
   2689	__bpf_md_ptr(struct sock_common *, sk_common);
   2690	uid_t uid __aligned(8);
   2691};
   2692
   2693static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
   2694			     struct sock_common *sk_common, uid_t uid)
   2695{
   2696	struct bpf_iter__tcp ctx;
   2697
   2698	meta->seq_num--;  /* skip SEQ_START_TOKEN */
   2699	ctx.meta = meta;
   2700	ctx.sk_common = sk_common;
   2701	ctx.uid = uid;
   2702	return bpf_iter_run_prog(prog, &ctx);
   2703}
   2704
   2705static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
   2706{
   2707	while (iter->cur_sk < iter->end_sk)
   2708		sock_put(iter->batch[iter->cur_sk++]);
   2709}
   2710
   2711static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
   2712				      unsigned int new_batch_sz)
   2713{
   2714	struct sock **new_batch;
   2715
   2716	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
   2717			     GFP_USER | __GFP_NOWARN);
   2718	if (!new_batch)
   2719		return -ENOMEM;
   2720
   2721	bpf_iter_tcp_put_batch(iter);
   2722	kvfree(iter->batch);
   2723	iter->batch = new_batch;
   2724	iter->max_sk = new_batch_sz;
   2725
   2726	return 0;
   2727}
   2728
   2729static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
   2730						 struct sock *start_sk)
   2731{
   2732	struct bpf_tcp_iter_state *iter = seq->private;
   2733	struct tcp_iter_state *st = &iter->state;
   2734	struct hlist_nulls_node *node;
   2735	unsigned int expected = 1;
   2736	struct sock *sk;
   2737
   2738	sock_hold(start_sk);
   2739	iter->batch[iter->end_sk++] = start_sk;
   2740
   2741	sk = sk_nulls_next(start_sk);
   2742	sk_nulls_for_each_from(sk, node) {
   2743		if (seq_sk_match(seq, sk)) {
   2744			if (iter->end_sk < iter->max_sk) {
   2745				sock_hold(sk);
   2746				iter->batch[iter->end_sk++] = sk;
   2747			}
   2748			expected++;
   2749		}
   2750	}
   2751	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
   2752
   2753	return expected;
   2754}
   2755
   2756static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
   2757						   struct sock *start_sk)
   2758{
   2759	struct bpf_tcp_iter_state *iter = seq->private;
   2760	struct tcp_iter_state *st = &iter->state;
   2761	struct hlist_nulls_node *node;
   2762	unsigned int expected = 1;
   2763	struct sock *sk;
   2764
   2765	sock_hold(start_sk);
   2766	iter->batch[iter->end_sk++] = start_sk;
   2767
   2768	sk = sk_nulls_next(start_sk);
   2769	sk_nulls_for_each_from(sk, node) {
   2770		if (seq_sk_match(seq, sk)) {
   2771			if (iter->end_sk < iter->max_sk) {
   2772				sock_hold(sk);
   2773				iter->batch[iter->end_sk++] = sk;
   2774			}
   2775			expected++;
   2776		}
   2777	}
   2778	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
   2779
   2780	return expected;
   2781}
   2782
   2783static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
   2784{
   2785	struct bpf_tcp_iter_state *iter = seq->private;
   2786	struct tcp_iter_state *st = &iter->state;
   2787	unsigned int expected;
   2788	bool resized = false;
   2789	struct sock *sk;
   2790
    2791	/* The st->bucket is done.  Directly advance to the next
    2792	 * bucket instead of having tcp_seek_last_pos() skip through
    2793	 * the current bucket one socket at a time, only to find out
    2794	 * it has to advance to the next bucket.
    2795	 */
   2796	if (iter->st_bucket_done) {
   2797		st->offset = 0;
   2798		st->bucket++;
   2799		if (st->state == TCP_SEQ_STATE_LISTENING &&
   2800		    st->bucket > tcp_hashinfo.lhash2_mask) {
   2801			st->state = TCP_SEQ_STATE_ESTABLISHED;
   2802			st->bucket = 0;
   2803		}
   2804	}
   2805
   2806again:
   2807	/* Get a new batch */
   2808	iter->cur_sk = 0;
   2809	iter->end_sk = 0;
   2810	iter->st_bucket_done = false;
   2811
   2812	sk = tcp_seek_last_pos(seq);
   2813	if (!sk)
   2814		return NULL; /* Done */
   2815
   2816	if (st->state == TCP_SEQ_STATE_LISTENING)
   2817		expected = bpf_iter_tcp_listening_batch(seq, sk);
   2818	else
   2819		expected = bpf_iter_tcp_established_batch(seq, sk);
   2820
   2821	if (iter->end_sk == expected) {
   2822		iter->st_bucket_done = true;
   2823		return sk;
   2824	}
   2825
   2826	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
   2827		resized = true;
   2828		goto again;
   2829	}
   2830
   2831	return sk;
   2832}
   2833
   2834static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
   2835{
   2836	/* bpf iter does not support lseek, so it always
    2837	 * continues from where it was stop()-ped.
   2838	 */
   2839	if (*pos)
   2840		return bpf_iter_tcp_batch(seq);
   2841
   2842	return SEQ_START_TOKEN;
   2843}
   2844
   2845static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   2846{
   2847	struct bpf_tcp_iter_state *iter = seq->private;
   2848	struct tcp_iter_state *st = &iter->state;
   2849	struct sock *sk;
   2850
   2851	/* Whenever seq_next() is called, the iter->cur_sk is
   2852	 * done with seq_show(), so advance to the next sk in
   2853	 * the batch.
   2854	 */
   2855	if (iter->cur_sk < iter->end_sk) {
   2856		/* Keeping st->num consistent in tcp_iter_state.
   2857		 * bpf_iter_tcp does not use st->num.
   2858		 * meta.seq_num is used instead.
   2859		 */
   2860		st->num++;
   2861		/* Move st->offset to the next sk in the bucket such that
   2862		 * the future start() will resume at st->offset in
   2863		 * st->bucket.  See tcp_seek_last_pos().
   2864		 */
   2865		st->offset++;
   2866		sock_put(iter->batch[iter->cur_sk++]);
   2867	}
   2868
   2869	if (iter->cur_sk < iter->end_sk)
   2870		sk = iter->batch[iter->cur_sk];
   2871	else
   2872		sk = bpf_iter_tcp_batch(seq);
   2873
   2874	++*pos;
   2875	/* Keeping st->last_pos consistent in tcp_iter_state.
    2876	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
   2877	 */
   2878	st->last_pos = *pos;
   2879	return sk;
   2880}
   2881
   2882static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
   2883{
   2884	struct bpf_iter_meta meta;
   2885	struct bpf_prog *prog;
   2886	struct sock *sk = v;
   2887	bool slow;
   2888	uid_t uid;
   2889	int ret;
   2890
   2891	if (v == SEQ_START_TOKEN)
   2892		return 0;
   2893
   2894	if (sk_fullsock(sk))
   2895		slow = lock_sock_fast(sk);
   2896
   2897	if (unlikely(sk_unhashed(sk))) {
   2898		ret = SEQ_SKIP;
   2899		goto unlock;
   2900	}
   2901
   2902	if (sk->sk_state == TCP_TIME_WAIT) {
   2903		uid = 0;
   2904	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
   2905		const struct request_sock *req = v;
   2906
   2907		uid = from_kuid_munged(seq_user_ns(seq),
   2908				       sock_i_uid(req->rsk_listener));
   2909	} else {
   2910		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
   2911	}
   2912
   2913	meta.seq = seq;
   2914	prog = bpf_iter_get_info(&meta, false);
   2915	ret = tcp_prog_seq_show(prog, &meta, v, uid);
   2916
   2917unlock:
   2918	if (sk_fullsock(sk))
   2919		unlock_sock_fast(sk, slow);
   2920	return ret;
   2921
   2922}
   2923
   2924static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
   2925{
   2926	struct bpf_tcp_iter_state *iter = seq->private;
   2927	struct bpf_iter_meta meta;
   2928	struct bpf_prog *prog;
   2929
   2930	if (!v) {
   2931		meta.seq = seq;
   2932		prog = bpf_iter_get_info(&meta, true);
   2933		if (prog)
   2934			(void)tcp_prog_seq_show(prog, &meta, v, 0);
   2935	}
   2936
   2937	if (iter->cur_sk < iter->end_sk) {
   2938		bpf_iter_tcp_put_batch(iter);
   2939		iter->st_bucket_done = false;
   2940	}
   2941}
   2942
   2943static const struct seq_operations bpf_iter_tcp_seq_ops = {
   2944	.show		= bpf_iter_tcp_seq_show,
   2945	.start		= bpf_iter_tcp_seq_start,
   2946	.next		= bpf_iter_tcp_seq_next,
   2947	.stop		= bpf_iter_tcp_seq_stop,
   2948};
   2949#endif
   2950static unsigned short seq_file_family(const struct seq_file *seq)
   2951{
   2952	const struct tcp_seq_afinfo *afinfo;
   2953
   2954#ifdef CONFIG_BPF_SYSCALL
    2955	/* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
   2956	if (seq->op == &bpf_iter_tcp_seq_ops)
   2957		return AF_UNSPEC;
   2958#endif
   2959
   2960	/* Iterated from proc fs */
   2961	afinfo = pde_data(file_inode(seq->file));
   2962	return afinfo->family;
   2963}
   2964
   2965static const struct seq_operations tcp4_seq_ops = {
   2966	.show		= tcp4_seq_show,
   2967	.start		= tcp_seq_start,
   2968	.next		= tcp_seq_next,
   2969	.stop		= tcp_seq_stop,
   2970};
   2971
   2972static struct tcp_seq_afinfo tcp4_seq_afinfo = {
   2973	.family		= AF_INET,
   2974};
   2975
   2976static int __net_init tcp4_proc_init_net(struct net *net)
   2977{
   2978	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
   2979			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
   2980		return -ENOMEM;
   2981	return 0;
   2982}
   2983
   2984static void __net_exit tcp4_proc_exit_net(struct net *net)
   2985{
   2986	remove_proc_entry("tcp", net->proc_net);
   2987}
   2988
   2989static struct pernet_operations tcp4_net_ops = {
   2990	.init = tcp4_proc_init_net,
   2991	.exit = tcp4_proc_exit_net,
   2992};
   2993
   2994int __init tcp4_proc_init(void)
   2995{
   2996	return register_pernet_subsys(&tcp4_net_ops);
   2997}
   2998
   2999void tcp4_proc_exit(void)
   3000{
   3001	unregister_pernet_subsys(&tcp4_net_ops);
   3002}
   3003#endif /* CONFIG_PROC_FS */
   3004
   3005/* @wake is one when sk_stream_write_space() calls us.
    3006 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
   3007 * This mimics the strategy used in sock_def_write_space().
   3008 */
   3009bool tcp_stream_memory_free(const struct sock *sk, int wake)
   3010{
   3011	const struct tcp_sock *tp = tcp_sk(sk);
   3012	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
   3013			    READ_ONCE(tp->snd_nxt);
   3014
   3015	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
   3016}
   3017EXPORT_SYMBOL(tcp_stream_memory_free);
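
/*
 * Illustrative sketch (not part of this file): the limit compared against
 * above is the socket's notsent_lowat, which userspace can lower with the
 * TCP_NOTSENT_LOWAT socket option so that poll()/epoll reports writability
 * only while little unsent data is queued.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_NOTSENT_LOWAT */

static int limit_unsent(int fd, unsigned int bytes)
{
	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &bytes, sizeof(bytes));
}
#endif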
   3018
   3019struct proto tcp_prot = {
   3020	.name			= "TCP",
   3021	.owner			= THIS_MODULE,
   3022	.close			= tcp_close,
   3023	.pre_connect		= tcp_v4_pre_connect,
   3024	.connect		= tcp_v4_connect,
   3025	.disconnect		= tcp_disconnect,
   3026	.accept			= inet_csk_accept,
   3027	.ioctl			= tcp_ioctl,
   3028	.init			= tcp_v4_init_sock,
   3029	.destroy		= tcp_v4_destroy_sock,
   3030	.shutdown		= tcp_shutdown,
   3031	.setsockopt		= tcp_setsockopt,
   3032	.getsockopt		= tcp_getsockopt,
   3033	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
   3034	.keepalive		= tcp_set_keepalive,
   3035	.recvmsg		= tcp_recvmsg,
   3036	.sendmsg		= tcp_sendmsg,
   3037	.sendpage		= tcp_sendpage,
   3038	.backlog_rcv		= tcp_v4_do_rcv,
   3039	.release_cb		= tcp_release_cb,
   3040	.hash			= inet_hash,
   3041	.unhash			= inet_unhash,
   3042	.get_port		= inet_csk_get_port,
   3043	.put_port		= inet_put_port,
   3044#ifdef CONFIG_BPF_SYSCALL
   3045	.psock_update_sk_prot	= tcp_bpf_update_proto,
   3046#endif
   3047	.enter_memory_pressure	= tcp_enter_memory_pressure,
   3048	.leave_memory_pressure	= tcp_leave_memory_pressure,
   3049	.stream_memory_free	= tcp_stream_memory_free,
   3050	.sockets_allocated	= &tcp_sockets_allocated,
   3051	.orphan_count		= &tcp_orphan_count,
   3052	.memory_allocated	= &tcp_memory_allocated,
   3053	.memory_pressure	= &tcp_memory_pressure,
   3054	.sysctl_mem		= sysctl_tcp_mem,
   3055	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
   3056	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
   3057	.max_header		= MAX_TCP_HEADER,
   3058	.obj_size		= sizeof(struct tcp_sock),
   3059	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
   3060	.twsk_prot		= &tcp_timewait_sock_ops,
   3061	.rsk_prot		= &tcp_request_sock_ops,
   3062	.h.hashinfo		= &tcp_hashinfo,
   3063	.no_autobind		= true,
   3064	.diag_destroy		= tcp_abort,
   3065};
   3066EXPORT_SYMBOL(tcp_prot);
   3067
   3068static void __net_exit tcp_sk_exit(struct net *net)
   3069{
   3070	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
   3071
   3072	if (net->ipv4.tcp_congestion_control)
   3073		bpf_module_put(net->ipv4.tcp_congestion_control,
   3074			       net->ipv4.tcp_congestion_control->owner);
   3075	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
   3076		kfree(tcp_death_row);
   3077}
   3078
   3079static int __net_init tcp_sk_init(struct net *net)
   3080{
   3081	int cnt;
   3082
   3083	net->ipv4.sysctl_tcp_ecn = 2;
   3084	net->ipv4.sysctl_tcp_ecn_fallback = 1;
   3085
   3086	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
   3087	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
   3088	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
   3089	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
   3090	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
   3091
   3092	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
   3093	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
   3094	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
   3095
   3096	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
   3097	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
   3098	net->ipv4.sysctl_tcp_syncookies = 1;
   3099	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
   3100	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
   3101	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
   3102	net->ipv4.sysctl_tcp_orphan_retries = 0;
   3103	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
   3104	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
   3105	net->ipv4.sysctl_tcp_tw_reuse = 2;
   3106	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
   3107
   3108	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
   3109	if (!net->ipv4.tcp_death_row)
   3110		return -ENOMEM;
   3111	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
   3112	cnt = tcp_hashinfo.ehash_mask + 1;
   3113	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
   3114	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
   3115
   3116	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
   3117	net->ipv4.sysctl_tcp_sack = 1;
   3118	net->ipv4.sysctl_tcp_window_scaling = 1;
   3119	net->ipv4.sysctl_tcp_timestamps = 1;
   3120	net->ipv4.sysctl_tcp_early_retrans = 3;
   3121	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
   3122	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
   3123	net->ipv4.sysctl_tcp_retrans_collapse = 1;
   3124	net->ipv4.sysctl_tcp_max_reordering = 300;
   3125	net->ipv4.sysctl_tcp_dsack = 1;
   3126	net->ipv4.sysctl_tcp_app_win = 31;
   3127	net->ipv4.sysctl_tcp_adv_win_scale = 1;
   3128	net->ipv4.sysctl_tcp_frto = 2;
   3129	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
   3130	/* This limits the percentage of the congestion window which we
   3131	 * will allow a single TSO frame to consume.  Building TSO frames
   3132	 * which are too large can cause TCP streams to be bursty.
   3133	 */
   3134	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
   3135	/* Default TSQ limit of 16 TSO segments */
   3136	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
   3137	/* rfc5961 challenge ack rate limiting */
   3138	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
   3139	net->ipv4.sysctl_tcp_min_tso_segs = 2;
   3140	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
   3141	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
   3142	net->ipv4.sysctl_tcp_autocorking = 1;
   3143	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
   3144	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
   3145	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
   3146	if (net != &init_net) {
   3147		memcpy(net->ipv4.sysctl_tcp_rmem,
   3148		       init_net.ipv4.sysctl_tcp_rmem,
   3149		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
   3150		memcpy(net->ipv4.sysctl_tcp_wmem,
   3151		       init_net.ipv4.sysctl_tcp_wmem,
   3152		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
   3153	}
   3154	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
   3155	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
   3156	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
   3157	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
   3158	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
   3159	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
   3160
   3161	/* Reno is always built in */
   3162	if (!net_eq(net, &init_net) &&
   3163	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
   3164			       init_net.ipv4.tcp_congestion_control->owner))
   3165		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
   3166	else
   3167		net->ipv4.tcp_congestion_control = &tcp_reno;
   3168
   3169	return 0;
   3170}
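
/*
 * Illustrative sketch (not part of this file): the per-netns default above
 * (TFO_CLIENT_ENABLE) only enables the client side of TCP Fast Open.  A
 * hypothetical server additionally sets the TCP_FASTOPEN socket option on
 * its listener (and the 0x2 server bit in net.ipv4.tcp_fastopen):
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_FASTOPEN */

static int enable_tfo_server(int listen_fd)
{
	int qlen = 16;	/* max number of pending TFO requests */

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
			  &qlen, sizeof(qlen));
}
#endif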
   3171
   3172static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
   3173{
   3174	struct net *net;
   3175
   3176	inet_twsk_purge(&tcp_hashinfo, AF_INET);
   3177
   3178	list_for_each_entry(net, net_exit_list, exit_list)
   3179		tcp_fastopen_ctx_destroy(net);
   3180}
   3181
   3182static struct pernet_operations __net_initdata tcp_sk_ops = {
   3183       .init	   = tcp_sk_init,
   3184       .exit	   = tcp_sk_exit,
   3185       .exit_batch = tcp_sk_exit_batch,
   3186};
   3187
   3188#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   3189DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
   3190		     struct sock_common *sk_common, uid_t uid)
   3191
   3192#define INIT_BATCH_SZ 16
   3193
   3194static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
   3195{
   3196	struct bpf_tcp_iter_state *iter = priv_data;
   3197	int err;
   3198
   3199	err = bpf_iter_init_seq_net(priv_data, aux);
   3200	if (err)
   3201		return err;
   3202
   3203	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
   3204	if (err) {
   3205		bpf_iter_fini_seq_net(priv_data);
   3206		return err;
   3207	}
   3208
   3209	return 0;
   3210}
   3211
   3212static void bpf_iter_fini_tcp(void *priv_data)
   3213{
   3214	struct bpf_tcp_iter_state *iter = priv_data;
   3215
   3216	bpf_iter_fini_seq_net(priv_data);
   3217	kvfree(iter->batch);
   3218}
   3219
   3220static const struct bpf_iter_seq_info tcp_seq_info = {
   3221	.seq_ops		= &bpf_iter_tcp_seq_ops,
   3222	.init_seq_private	= bpf_iter_init_tcp,
   3223	.fini_seq_private	= bpf_iter_fini_tcp,
   3224	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
   3225};
   3226
   3227static const struct bpf_func_proto *
   3228bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
   3229			    const struct bpf_prog *prog)
   3230{
   3231	switch (func_id) {
   3232	case BPF_FUNC_setsockopt:
   3233		return &bpf_sk_setsockopt_proto;
   3234	case BPF_FUNC_getsockopt:
   3235		return &bpf_sk_getsockopt_proto;
   3236	default:
   3237		return NULL;
   3238	}
   3239}
   3240
   3241static struct bpf_iter_reg tcp_reg_info = {
   3242	.target			= "tcp",
   3243	.ctx_arg_info_size	= 1,
   3244	.ctx_arg_info		= {
   3245		{ offsetof(struct bpf_iter__tcp, sk_common),
   3246		  PTR_TO_BTF_ID_OR_NULL },
   3247	},
   3248	.get_func_proto		= bpf_iter_tcp_get_func_proto,
   3249	.seq_info		= &tcp_seq_info,
   3250};
   3251
   3252static void __init bpf_iter_register(void)
   3253{
   3254	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
   3255	if (bpf_iter_reg_target(&tcp_reg_info))
   3256		pr_warn("Warning: could not register bpf iterator tcp\n");
   3257}
   3258
   3259#endif
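
/*
 * Illustrative sketch (not part of this file): a BPF program that attaches
 * to the "tcp" iterator registered above, following the pattern used by the
 * kernel's BPF selftests.  It assumes vmlinux.h and libbpf's bpf_helpers.h
 * are available and that it is built separately as BPF object code.
 */
#if 0	/* example only, built with clang -target bpf */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

long tcp_sk_count = 0;

SEC("iter/tcp")
int count_tcp(struct bpf_iter__tcp *ctx)
{
	/* ctx->sk_common is NULL on the final call made from seq_stop(). */
	if (ctx->sk_common)
		__sync_fetch_and_add(&tcp_sk_count, 1);
	return 0;
}
#endif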
   3260
   3261void __init tcp_v4_init(void)
   3262{
   3263	int cpu, res;
   3264
   3265	for_each_possible_cpu(cpu) {
   3266		struct sock *sk;
   3267
   3268		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
   3269					   IPPROTO_TCP, &init_net);
   3270		if (res)
   3271			panic("Failed to create the TCP control socket.\n");
   3272		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
   3273
   3274		/* Please enforce IP_DF and IPID==0 for RST and
   3275		 * ACK sent in SYN-RECV and TIME-WAIT state.
   3276		 */
   3277		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
   3278
   3279		per_cpu(ipv4_tcp_sk, cpu) = sk;
   3280	}
   3281	if (register_pernet_subsys(&tcp_sk_ops))
   3282		panic("Failed to create the TCP control socket.\n");
   3283
   3284#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   3285	bpf_iter_register();
   3286#endif
   3287}