cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sock_reuseport.c (16865B)


// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
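
/*
 * Illustrative userspace sketch (added for clarity, not part of the original
 * file): each worker creates its own socket, enables SO_REUSEPORT before
 * bind(), and binds to the same address/port.  The kernel then builds the
 * reuseport group managed below and spreads incoming connections or
 * datagrams across the group members.  Names below are made up for the
 * example.
 */
#if 0	/* example only, never compiled */
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_worker_socket(unsigned short port)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	/* must be enabled on every socket in the group before bind() */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
	    listen(fd, SOMAXCONN)) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif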

#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany);

static int reuseport_sock_index(struct sock *sk,
				const struct sock_reuseport *reuse,
				bool closed)
{
	int left, right;

	if (!closed) {
		left = 0;
		right = reuse->num_socks;
	} else {
		left = reuse->max_socks - reuse->num_closed_socks;
		right = reuse->max_socks;
	}

	for (; left < right; left++)
		if (reuse->socks[left] == sk)
			return left;
	return -1;
}
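
/*
 * Layout note (added for clarity, not part of the original file): socks[]
 * keeps the listening sockets packed at the front and the closed
 * (shutdown()ed) sockets packed at the back, so with max_socks == 8,
 * num_socks == 3 and num_closed_socks == 2 the array looks like:
 *
 *	index:  0   1   2   3   4   5   6   7
 *	       [L] [L] [L] [ ] [ ] [ ] [C] [C]
 *
 * reuseport_sock_index() therefore scans [0, num_socks) for listening
 * sockets and [max_socks - num_closed_socks, max_socks) for closed ones.
 */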

static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
}

static bool __reuseport_detach_sock(struct sock *sk,
				    struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, false);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
	reuse->num_socks--;

	return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
					struct sock_reuseport *reuse)
{
	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
					   struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, true);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);

	return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	unsigned int size = sizeof(struct sock_reuseport) +
		      sizeof(struct sock *) * max_socks;
	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);

	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *reuse;
	int id, ret = 0;

	/* bh lock used since this function call may precede hlist lock in
	 * soft irq of receive path or setsockopt from process context
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path.  Nothing to do when we lose the race.
	 */
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (reuse) {
		if (reuse->num_closed_socks) {
			/* sk was shutdown()ed before */
			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
			goto out;
		}

		/* Only set reuse->bind_inany if the bind_inany is true.
		 * Otherwise, it will overwrite the reuse->bind_inany
		 * which was set by the bind/hash path.
		 */
		if (bind_inany)
			reuse->bind_inany = bind_inany;
		goto out;
	}

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		ret = -ENOMEM;
		goto out;
	}

	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (id < 0) {
		kfree(reuse);
		ret = id;
		goto out;
	}

	reuse->reuseport_id = id;
	reuse->bind_inany = bind_inany;
	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX) {
		if (reuse->num_closed_socks) {
			/* Make room by removing a closed sk.
			 * The child has already been migrated.
			 * Only reqsk left at this point.
			 */
			struct sock *sk;

			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
			__reuseport_detach_closed_sock(sk, reuse);

			return reuse;
		}

		return NULL;
	}

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->num_socks = reuse->num_socks;
	more_reuse->num_closed_socks = reuse->num_closed_socks;
	more_reuse->prog = reuse->prog;
	more_reuse->reuseport_id = reuse->reuseport_id;
	more_reuse->bind_inany = reuse->bind_inany;
	more_reuse->has_conns = reuse->has_conns;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));
	memcpy(more_reuse->socks +
	       (more_reuse->max_socks - more_reuse->num_closed_socks),
	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
	       reuse->num_closed_socks * sizeof(struct sock *));
	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

	for (i = 0; i < reuse->max_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
	ida_free(&reuseport_ida, reuse->reuseport_id);
	kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2, bind_inany);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_closed_socks) {
		/* sk was shutdown()ed before */
		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

		spin_unlock_bh(&reuseport_lock);
		return err;
	}

	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
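
/*
 * Usage note (added for clarity, not part of the original file): the first
 * SO_REUSEPORT socket hashed to a port ends up in reuseport_alloc(), and
 * every later one is added with reuseport_add_sock() against an existing
 * group member, so a group's lifetime typically looks like
 *
 *	reuseport_alloc(sk1, bind_inany);
 *	reuseport_add_sock(sk2, sk1, bind_inany);
 *	reuseport_add_sock(sk3, sk1, bind_inany);
 *
 * with reuseport_grow() doubling socks[] once the group fills up.
 */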

static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany)
{
	if (old_reuse == reuse) {
		/* If sk was in the same reuseport group, just pop sk out of
		 * the closed section and push sk into the listening section.
		 */
		__reuseport_detach_closed_sock(sk, old_reuse);
		__reuseport_add_sock(sk, old_reuse);
		return 0;
	}

	if (!reuse) {
		/* In bind()/listen() path, we cannot carry over the eBPF prog
		 * for the shutdown()ed socket. In setsockopt() path, we should
		 * not change the eBPF prog of listening sockets by attaching a
		 * prog to the shutdown()ed socket. Thus, we will allocate a new
		 * reuseport group and detach sk from the old group.
		 */
		int id;

		reuse = __reuseport_alloc(INIT_SOCKS);
		if (!reuse)
			return -ENOMEM;

		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
		if (id < 0) {
			kfree(reuse);
			return id;
		}

		reuse->reuseport_id = id;
		reuse->bind_inany = bind_inany;
	} else {
		/* Move sk from the old group to the new one if
		 * - all the other listeners in the old group were close()d or
		 *   shutdown()ed, and then sk2 has listen()ed on the same port
		 * OR
		 * - sk listen()ed without bind() (or with autobind), was
		 *   shutdown()ed, and then listen()s on another port which
		 *   sk2 listen()s on.
		 */
		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
			reuse = reuseport_grow(reuse);
			if (!reuse)
				return -ENOMEM;
		}
	}

	__reuseport_detach_closed_sock(sk, old_reuse);
	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);

	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk */
	if (!reuse)
		goto out;

	/* Notify the bpf side. The sk may be added to a sockarray
	 * map. If so, sockarray logic will remove it from the map.
	 *
	 * Other bpf map types that work with reuseport, like sockmap,
	 * don't need an explicit callback from here. They override sk
	 * unhash/close ops to remove the sk from the map before we
	 * get to this point.
	 */
	bpf_sk_reuseport_detach(sk);

	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	if (!__reuseport_detach_closed_sock(sk, reuse))
		__reuseport_detach_sock(sk, reuse);

	if (reuse->num_socks + reuse->num_closed_socks == 0)
		call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

void reuseport_stop_listen_sock(struct sock *sk)
{
	if (sk->sk_protocol == IPPROTO_TCP) {
		struct sock_reuseport *reuse;
		struct bpf_prog *prog;

		spin_lock_bh(&reuseport_lock);

		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
						  lockdep_is_held(&reuseport_lock));
		prog = rcu_dereference_protected(reuse->prog,
						 lockdep_is_held(&reuseport_lock));

		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
			/* Migration capable, move sk from the listening section
			 * to the closed section.
			 */
			bpf_sk_reuseport_detach(sk);

			__reuseport_detach_sock(sk, reuse);
			__reuseport_add_closed_sock(sk, reuse);

			spin_unlock_bh(&reuseport_lock);
			return;
		}

		spin_unlock_bh(&reuseport_lock);
	}

	/* Not capable to do migration, detach immediately */
	reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);
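
/*
 * Usage note (added for clarity, not part of the original file): the
 * migration branch above is only taken when request migration is enabled,
 * either globally via
 *
 *	sysctl -w net.ipv4.tcp_migrate_req=1
 *
 * or per group by attaching an eBPF program loaded with the
 * BPF_SK_REUSEPORT_SELECT_OR_MIGRATE attach type; otherwise the closing
 * listener is detached from the group right away via reuseport_detach_sock().
 */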

static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}
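
/*
 * Illustrative sketch (added for clarity, not part of the original file):
 * run_bpf_filter() handles classic BPF programs attached with
 * SO_ATTACH_REUSEPORT_CBPF, whose return value is taken as an index into
 * socks[].  A well-known example steers each packet to the socket whose
 * index equals the CPU that received it (helper name is made up):
 */
#if 0	/* example only, never compiled */
#include <linux/filter.h>
#include <sys/socket.h>

static int attach_cpu_steering(int fd)
{
	struct sock_filter code[] = {
		/* A = number of the CPU handling the packet */
		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
		/* return A as the socket index */
		{ BPF_RET | BPF_A, 0, 0, 0 },
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
			  &prog, sizeof(prog));
}
#endif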

static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
						  u32 hash, u16 num_socks)
{
	int i, j;

	i = j = reciprocal_scale(hash, num_socks);
	while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
		i++;
		if (i >= num_socks)
			i = 0;
		if (i == j)
			return NULL;
	}

	return reuse->socks[i];
}
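
/*
 * Note (added for clarity, not part of the original file):
 * reciprocal_scale() maps the 32-bit hash onto [0, num_socks) without a
 * division:
 *
 *	index = ((u64)hash * num_socks) >> 32
 *
 * e.g. hash = 0x80000000 and num_socks = 4 yield index = 2.  The loop
 * above then walks forward past sockets already in TCP_ESTABLISHED state
 * (e.g. connected UDP sockets) and gives up with NULL once it wraps
 * around to the starting index.
 */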

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in __reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	bool allocated = false;
	struct bpf_prog *prog;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto failure;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	hash = migrating_sk->sk_hash;
	prog = rcu_dereference(reuse->prog);
	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
			goto select_by_hash;
		goto failure;
	}

	if (!skb) {
		skb = alloc_skb(0, GFP_ATOMIC);
		if (!skb)
			goto failure;
		allocated = true;
	}

	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);

	if (allocated)
		kfree_skb(skb);

select_by_hash:
	if (!nsk)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
		nsk = NULL;
		goto failure;
	}

out:
	rcu_read_unlock();
	return nsk;

failure:
	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
	goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	if (sk_unhashed(sk)) {
		int err;

		if (!sk->sk_reuseport)
			return -EINVAL;

		err = reuseport_alloc(sk, false);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
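
/*
 * Illustrative sketch (added for clarity, not part of the original file):
 * reuseport_attach_prog() is reached from setsockopt(SO_ATTACH_REUSEPORT_EBPF)
 * with a loaded BPF_PROG_TYPE_SK_REUSEPORT program.  A minimal program might
 * pick the target socket out of a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY (map and
 * function names below are made up for the example):
 */
#if 0	/* example only, never compiled */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} reuseport_map SEC(".maps");

SEC("sk_reuseport")
int select_sock(struct sk_reuseport_md *md)
{
	__u32 key = md->hash % 16;

	/* steer the packet to the socket stored at 'key', if any */
	if (bpf_sk_select_reuseport(md, &reuseport_map, &key, 0) == 0)
		return SK_PASS;
	return SK_DROP;
}

/* userspace side: load the object, then
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *		   &prog_fd, sizeof(prog_fd));
 */
#endif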

int reuseport_detach_prog(struct sock *sk)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	old_prog = NULL;
	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuse must be checked after acquiring the reuseport_lock
	 * because reuseport_grow() can detach a closed sk.
	 */
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return sk->sk_reuseport ? -ENOENT : -EINVAL;
	}

	if (sk_unhashed(sk) && reuse->num_closed_socks) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOENT;
	}

	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (!old_prog)
		return -ENOENT;

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);