cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

inet_hashtables.c (24502B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev    = l3mdev;
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

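/*
 * Record @snum as the socket's local port and link the socket onto the
 * bind bucket's owner list; the caller must hold the bucket's lock.
 */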
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

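/*
 * Let a child socket inherit its listener's local port. The parent's
 * bind bucket is normally reused directly; if the ports differ (see the
 * tproxy note below), a matching bucket is found or created first.
 */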
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;
	int l3mdev;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->l3mdev == l3mdev && tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port,
						     l3mdev);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
		inet_csk_update_fastreuse(tb, child);
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

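/*
 * Map a listening socket to its lhash2 bucket, hashing (net, local
 * address, local port) for whichever address family the socket uses.
 */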
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}

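/*
 * Score a candidate listener against an incoming packet: -1 means it
 * cannot match; otherwise a device binding, a genuine IPv4 socket and a
 * matching incoming CPU each add a point, so the most specific listener
 * wins the lookup.
 */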
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif, const int sdif)
{
	int score = -1;

	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
			!ipv6_only_sock(sk)) {
		if (sk->sk_rcv_saddr != daddr)
			return -1;

		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
			return -1;
		score = sk->sk_bound_dev_if ? 2 : 1;

		if (sk->sk_family == PF_INET)
			score++;
		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
			score++;
	}
	return score;
}

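/*
 * If @sk belongs to a SO_REUSEPORT group, select one socket out of the
 * group by the 4-tuple hash (or by the group's attached BPF program);
 * returns NULL when @sk is not a reuseport socket.
 */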
static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
					    struct sk_buff *skb, int doff,
					    __be32 saddr, __be16 sport,
					    __be32 daddr, unsigned short hnum)
{
	struct sock *reuse_sk = NULL;
	u32 phash;

	if (sk->sk_reuseport) {
		phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
	}
	return reuse_sk;
}

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	int score, hiscore = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, sdif);
		if (score > hiscore) {
			result = lookup_reuseport(net, sk, skb, doff,
						  saddr, sport, daddr, hnum);
			if (result)
				return result;

			result = sk;
			hiscore = score;
		}
	}

	return result;
}

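/*
 * Give a BPF sk_lookup program the chance to steer this lookup: returns
 * NULL to fall back to the normal hash table walk, or the socket (or
 * error) chosen by the program.
 */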
static inline struct sock *inet_lookup_run_bpf(struct net *net,
					       struct inet_hashinfo *hashinfo,
					       struct sk_buff *skb, int doff,
					       __be32 saddr, __be16 sport,
					       __be32 daddr, u16 hnum, const int dif)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	if (hashinfo != &tcp_hashinfo)
		return NULL; /* only TCP is supported */

	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
					    daddr, hnum, dif, &sk);
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
					     saddr, sport, daddr, hnum, dif);
		if (result)
			goto done;
	}

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* All sockets share a common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

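/*
 * Find an established or timewait socket by its 4-tuple. Runs locklessly
 * under RCU on an hlist_nulls chain: the nulls value terminating each
 * chain encodes the bucket, so a walk that ends on the wrong nulls value
 * (an entry was moved to another chain underneath us) is detected and
 * restarted.
 */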
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyway.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!inet_match(net, sk, acookie,
						 ports, dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

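/*
 * Derive the RFC 6056 per-destination offset used for ephemeral port
 * selection, keyed on (local address, remote address, remote port) with
 * a secret, so different destinations get independent port sequences.
 */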
static u64 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
				    struct hlist_nulls_head *list)
{
	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
	const int sdif = sk->sk_bound_dev_if;
	const int dif = sk->sk_bound_dev_if;
	const struct hlist_nulls_node *node;
	struct net *net = sock_net(sk);
	struct sock *esk;

	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

	sk_nulls_for_each_rcu(esk, node, list) {
		if (esk->sk_hash != sk->sk_hash)
			continue;
		if (sk->sk_family == AF_INET) {
			if (unlikely(inet_match(net, esk, acookie,
						ports, dif, sdif))) {
				return true;
			}
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			if (unlikely(inet6_match(net, esk,
						 &sk->sk_v6_daddr,
						 &sk->sk_v6_rcv_saddr,
						 ports, dif, sdif))) {
				return true;
			}
		}
#endif
	}
	return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket).
 * If a duplicate socket already exists, sk is not inserted
 * and *found_dup_sk is set to true.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	} else if (found_dup_sk) {
		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
		if (*found_dup_sk)
			ret = false;
	}

	if (ret)
		__sk_nulls_add_node_rcu(sk, list);

	spin_unlock(lock);

	return ret;
}

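/*
 * Hash a non-listening socket into ehash and account it as in use;
 * if a duplicate was found instead, the socket is marked dead and
 * destroyed.
 */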
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

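/*
 * Attach @sk to an existing SO_REUSEPORT group on this listen bucket if
 * a compatible socket (same family, bound device, bind bucket, owner
 * uid and local address) is found, otherwise allocate a fresh group.
 */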
static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

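/*
 * Hash a socket so incoming packets can find it: non-listening sockets
 * go into ehash; listeners join their reuseport group, if any, and are
 * then linked into the lhash2 listener bucket.
 */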
int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb2;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		local_bh_disable();
		inet_ehash_nolisten(sk, osk, NULL);
		local_bh_enable();
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb2);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb2->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE)
		err = __inet_hash(sk, NULL);

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

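/*
 * Remove a socket from the lookup tables: listeners come off their
 * lhash2 bucket (detaching from reuseport selection first), all others
 * off their ehash chain. The second sk_unhashed() check under the lock
 * makes this safe against a concurrent unhash.
 */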
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		struct inet_listen_hashbucket *ilb2;

		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
		/* Don't disable bottom halves while acquiring the lock to
		 * avoid circular locking dependency on PREEMPT_RT.
		 */
		spin_lock(&ilb2->lock);
		if (sk_unhashed(sk)) {
			spin_unlock(&ilb2->lock);
			return;
		}

		if (rcu_access_pointer(sk->sk_reuseport_cb))
			reuseport_stop_listen_sock(sk);

		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock(&ilb2->lock);
	} else {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

		spin_lock_bh(lock);
		if (sk_unhashed(sk)) {
			spin_unlock_bh(lock);
			return;
		}
		__sk_nulls_del_node_init_rcu(sk);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
		spin_unlock_bh(lock);
	}
}
EXPORT_SYMBOL_GPL(inet_unhash);

/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by a clever attacker.
 * The RFC claims that TABLE_LENGTH=10 buckets give an improvement, but
 * attacks have since been demonstrated against tables that small, so we
 * use 65536 buckets instead for better isolation and privacy, at the
 * expense of 256 KB of kernel memory.
 */
#define INET_TABLE_PERTURB_SHIFT 16
#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT)
static u32 *table_perturb;

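/*
 * Pick an ephemeral source port for an outgoing connection using the
 * double-hash scheme described above: the scan starts at
 *
 *	offset = (table_perturb[port_offset % SIZE] + (port_offset >> 32))
 *			% remaining
 *
 * and first walks ports of the same parity as @low (inet_csk_get_port()
 * prefers the opposite parity), then the other parity, before giving up
 * with -EADDRNOTAVAIL. On success the perturbation entry is advanced so
 * the next connection starts its search elsewhere.
 */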
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	int l3mdev;
	u32 index;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk the established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000) */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	net_get_random_once(table_perturb,
			    INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In the first pass we try ports of the same parity as @low;
	 * inet_csk_get_port() makes the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
			    tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be nonexistent.
	 */
	i = max_t(int, i, (prandom_u32() & 7) * 2);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u64 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}
}

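/*
 * Boot-time setup: allocate the lhash2 listener table and the
 * table_perturb array used by __inet_hash_connect() from the large
 * system hash allocator.
 */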
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	h->lhash2 = alloc_large_system_hash(name,
					    sizeof(*h->lhash2),
					    numentries,
					    scale,
					    0,
					    NULL,
					    &h->lhash2_mask,
					    low_limit,
					    high_limit);
	init_hashinfo_lhash2(h);

	/* this one is used for source ports of outgoing connections */
	table_perturb = alloc_large_system_hash("Table-perturb",
						sizeof(*table_perturb),
						INET_TABLE_PERTURB_SIZE,
						0, 0, NULL, NULL,
						INET_TABLE_PERTURB_SIZE,
						INET_TABLE_PERTURB_SIZE);
}

int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
	if (!h->lhash2)
		return -ENOMEM;

	h->lhash2_mask = INET_LHTABLE_SIZE - 1;
	/* INET_LHTABLE_SIZE must be a power of 2 */
	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

	init_hashinfo_lhash2(h);
	return 0;
}
EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);

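/*
 * Allocate the array of spinlocks protecting the ehash chains: about
 * two cache lines' worth of locks per possible CPU, rounded up to a
 * power of two and capped at one lock per bucket, so a lock can be
 * found by masking the hash with ehash_locks_mask.
 */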
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;

	if (locksz != 0) {
		/* allocate 2 cache lines or at least one spinlock per cpu */
		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

		/* no more locks than number of hash buckets */
		nblocks = min(nblocks, hashinfo->ehash_mask + 1);

		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!hashinfo->ehash_locks)
			return -ENOMEM;

		for (i = 0; i < nblocks; i++)
			spin_lock_init(&hashinfo->ehash_locks[i]);
	}
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);