cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xsk.c (33810B)


      1// SPDX-License-Identifier: GPL-2.0
      2/* XDP sockets
      3 *
      4 * AF_XDP sockets allow a channel between XDP programs and userspace
      5 * applications.
      6 * Copyright(c) 2018 Intel Corporation.
      7 *
      8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
      9 *	      Magnus Karlsson <magnus.karlsson@intel.com>
     10 */
     11
     12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
     13
     14#include <linux/if_xdp.h>
     15#include <linux/init.h>
     16#include <linux/sched/mm.h>
     17#include <linux/sched/signal.h>
     18#include <linux/sched/task.h>
     19#include <linux/socket.h>
     20#include <linux/file.h>
     21#include <linux/uaccess.h>
     22#include <linux/net.h>
     23#include <linux/netdevice.h>
     24#include <linux/rculist.h>
     25#include <net/xdp_sock_drv.h>
     26#include <net/busy_poll.h>
     27#include <net/xdp.h>
     28
     29#include "xsk_queue.h"
     30#include "xdp_umem.h"
     31#include "xsk.h"
     32
     33#define TX_BATCH_SIZE 32
     34
     35static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
     36
     37void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
     38{
     39	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
     40		return;
     41
     42	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
     43	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
     44}
     45EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
     46
     47void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
     48{
     49	struct xdp_sock *xs;
     50
     51	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
     52		return;
     53
     54	rcu_read_lock();
     55	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
     56		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
     57	}
     58	rcu_read_unlock();
     59
     60	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
     61}
     62EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
     63
     64void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
     65{
     66	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
     67		return;
     68
     69	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
     70	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
     71}
     72EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
     73
     74void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
     75{
     76	struct xdp_sock *xs;
     77
     78	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
     79		return;
     80
     81	rcu_read_lock();
     82	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
     83		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
     84	}
     85	rcu_read_unlock();
     86
     87	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
     88}
     89EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
     90
     91bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
     92{
     93	return pool->uses_need_wakeup;
     94}
     95EXPORT_SYMBOL(xsk_uses_need_wakeup);
     96
     97struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
     98					    u16 queue_id)
     99{
    100	if (queue_id < dev->real_num_rx_queues)
    101		return dev->_rx[queue_id].pool;
    102	if (queue_id < dev->real_num_tx_queues)
    103		return dev->_tx[queue_id].pool;
    104
    105	return NULL;
    106}
    107EXPORT_SYMBOL(xsk_get_pool_from_qid);
    108
    109void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
    110{
    111	if (queue_id < dev->num_rx_queues)
    112		dev->_rx[queue_id].pool = NULL;
    113	if (queue_id < dev->num_tx_queues)
    114		dev->_tx[queue_id].pool = NULL;
    115}
    116
    117/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
    118 * not know if the device has more tx queues than rx, or the opposite.
    119 * This might also change during run time.
    120 */
    121int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
    122			u16 queue_id)
    123{
    124	if (queue_id >= max_t(unsigned int,
    125			      dev->real_num_rx_queues,
    126			      dev->real_num_tx_queues))
    127		return -EINVAL;
    128
    129	if (queue_id < dev->real_num_rx_queues)
    130		dev->_rx[queue_id].pool = pool;
    131	if (queue_id < dev->real_num_tx_queues)
    132		dev->_tx[queue_id].pool = pool;
    133
    134	return 0;
    135}
    136
    137static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
    138{
    139	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
    140	u64 addr;
    141	int err;
    142
    143	addr = xp_get_handle(xskb);
    144	err = xskq_prod_reserve_desc(xs->rx, addr, len);
    145	if (err) {
    146		xs->rx_queue_full++;
    147		return err;
    148	}
    149
    150	xp_release(xskb);
    151	return 0;
    152}
    153
    154static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
    155{
    156	void *from_buf, *to_buf;
    157	u32 metalen;
    158
    159	if (unlikely(xdp_data_meta_unsupported(from))) {
    160		from_buf = from->data;
    161		to_buf = to->data;
    162		metalen = 0;
    163	} else {
    164		from_buf = from->data_meta;
    165		metalen = from->data - from->data_meta;
    166		to_buf = to->data - metalen;
    167	}
    168
    169	memcpy(to_buf, from_buf, len + metalen);
    170}
    171
    172static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
    173{
    174	struct xdp_buff *xsk_xdp;
    175	int err;
    176	u32 len;
    177
    178	len = xdp->data_end - xdp->data;
    179	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
    180		xs->rx_dropped++;
    181		return -ENOSPC;
    182	}
    183
    184	xsk_xdp = xsk_buff_alloc(xs->pool);
    185	if (!xsk_xdp) {
    186		xs->rx_dropped++;
    187		return -ENOMEM;
    188	}
    189
    190	xsk_copy_xdp(xsk_xdp, xdp, len);
    191	err = __xsk_rcv_zc(xs, xsk_xdp, len);
    192	if (err) {
    193		xsk_buff_free(xsk_xdp);
    194		return err;
    195	}
    196	return 0;
    197}
    198
    199static bool xsk_tx_writeable(struct xdp_sock *xs)
    200{
    201	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
    202		return false;
    203
    204	return true;
    205}
    206
    207static bool xsk_is_bound(struct xdp_sock *xs)
    208{
    209	if (READ_ONCE(xs->state) == XSK_BOUND) {
    210		/* Matches smp_wmb() in bind(). */
    211		smp_rmb();
    212		return true;
    213	}
    214	return false;
    215}
    216
    217static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
    218{
    219	if (!xsk_is_bound(xs))
    220		return -ENXIO;
    221
    222	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
    223		return -EINVAL;
    224
    225	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
    226	return 0;
    227}
    228
    229static void xsk_flush(struct xdp_sock *xs)
    230{
    231	xskq_prod_submit(xs->rx);
    232	__xskq_cons_release(xs->pool->fq);
    233	sock_def_readable(&xs->sk);
    234}
    235
    236int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
    237{
    238	int err;
    239
    240	spin_lock_bh(&xs->rx_lock);
    241	err = xsk_rcv_check(xs, xdp);
    242	if (!err) {
    243		err = __xsk_rcv(xs, xdp);
    244		xsk_flush(xs);
    245	}
    246	spin_unlock_bh(&xs->rx_lock);
    247	return err;
    248}
    249
    250static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
    251{
    252	int err;
    253	u32 len;
    254
    255	err = xsk_rcv_check(xs, xdp);
    256	if (err)
    257		return err;
    258
    259	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
    260		len = xdp->data_end - xdp->data;
    261		return __xsk_rcv_zc(xs, xdp, len);
    262	}
    263
    264	err = __xsk_rcv(xs, xdp);
    265	if (!err)
    266		xdp_return_buff(xdp);
    267	return err;
    268}
    269
    270int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
    271{
    272	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
    273	int err;
    274
    275	err = xsk_rcv(xs, xdp);
    276	if (err)
    277		return err;
    278
    279	if (!xs->flush_node.prev)
    280		list_add(&xs->flush_node, flush_list);
    281
    282	return 0;
    283}
    284
    285void __xsk_map_flush(void)
    286{
    287	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
    288	struct xdp_sock *xs, *tmp;
    289
    290	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
    291		xsk_flush(xs);
    292		__list_del_clearprev(&xs->flush_node);
    293	}
    294}
    295
    296void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
    297{
    298	xskq_prod_submit_n(pool->cq, nb_entries);
    299}
    300EXPORT_SYMBOL(xsk_tx_completed);
    301
    302void xsk_tx_release(struct xsk_buff_pool *pool)
    303{
    304	struct xdp_sock *xs;
    305
    306	rcu_read_lock();
    307	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
    308		__xskq_cons_release(xs->tx);
    309		if (xsk_tx_writeable(xs))
    310			xs->sk.sk_write_space(&xs->sk);
    311	}
    312	rcu_read_unlock();
    313}
    314EXPORT_SYMBOL(xsk_tx_release);
    315
    316bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
    317{
    318	struct xdp_sock *xs;
    319
    320	rcu_read_lock();
    321	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
    322		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
    323			xs->tx->queue_empty_descs++;
    324			continue;
    325		}
    326
    327		/* This is the backpressure mechanism for the Tx path.
    328		 * Reserve space in the completion queue and only proceed
    329		 * if there is space in it. This avoids having to implement
    330		 * any buffering in the Tx path.
    331		 */
    332		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
    333			goto out;
    334
    335		xskq_cons_release(xs->tx);
    336		rcu_read_unlock();
    337		return true;
    338	}
    339
    340out:
    341	rcu_read_unlock();
    342	return false;
    343}
    344EXPORT_SYMBOL(xsk_tx_peek_desc);
    345
    346static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
    347{
    348	struct xdp_desc *descs = pool->tx_descs;
    349	u32 nb_pkts = 0;
    350
    351	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
    352		nb_pkts++;
    353
    354	xsk_tx_release(pool);
    355	return nb_pkts;
    356}
    357
    358u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
    359{
    360	struct xdp_sock *xs;
    361	u32 nb_pkts;
    362
    363	rcu_read_lock();
    364	if (!list_is_singular(&pool->xsk_tx_list)) {
    365		/* Fall back to the non-batched version */
    366		rcu_read_unlock();
    367		return xsk_tx_peek_release_fallback(pool, max_entries);
    368	}
    369
    370	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
    371	if (!xs) {
    372		nb_pkts = 0;
    373		goto out;
    374	}
    375
    376	max_entries = xskq_cons_nb_entries(xs->tx, max_entries);
    377	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries);
    378	if (!nb_pkts) {
    379		xs->tx->queue_empty_descs++;
    380		goto out;
    381	}
    382
    383	/* This is the backpressure mechanism for the Tx path. Try to
    384	 * reserve space in the completion queue for all packets, but
    385	 * if there are fewer slots available, just process that many
    386	 * packets. This avoids having to implement any buffering in
    387	 * the Tx path.
    388	 */
    389	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
    390	if (!nb_pkts)
    391		goto out;
    392
    393	xskq_cons_release_n(xs->tx, max_entries);
    394	__xskq_cons_release(xs->tx);
    395	xs->sk.sk_write_space(&xs->sk);
    396
    397out:
    398	rcu_read_unlock();
    399	return nb_pkts;
    400}
    401EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
    402
    403static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
    404{
    405	struct net_device *dev = xs->dev;
    406
    407	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
    408}
    409
    410static void xsk_destruct_skb(struct sk_buff *skb)
    411{
    412	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
    413	struct xdp_sock *xs = xdp_sk(skb->sk);
    414	unsigned long flags;
    415
    416	spin_lock_irqsave(&xs->pool->cq_lock, flags);
    417	xskq_prod_submit_addr(xs->pool->cq, addr);
    418	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
    419
    420	sock_wfree(skb);
    421}
    422
    423static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
    424					      struct xdp_desc *desc)
    425{
    426	struct xsk_buff_pool *pool = xs->pool;
    427	u32 hr, len, ts, offset, copy, copied;
    428	struct sk_buff *skb;
    429	struct page *page;
    430	void *buffer;
    431	int err, i;
    432	u64 addr;
    433
    434	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
    435
    436	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
    437	if (unlikely(!skb))
    438		return ERR_PTR(err);
    439
    440	skb_reserve(skb, hr);
    441
    442	addr = desc->addr;
    443	len = desc->len;
    444	ts = pool->unaligned ? len : pool->chunk_size;
    445
    446	buffer = xsk_buff_raw_get_data(pool, addr);
    447	offset = offset_in_page(buffer);
    448	addr = buffer - pool->addrs;
    449
    450	for (copied = 0, i = 0; copied < len; i++) {
    451		page = pool->umem->pgs[addr >> PAGE_SHIFT];
    452		get_page(page);
    453
    454		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
    455		skb_fill_page_desc(skb, i, page, offset, copy);
    456
    457		copied += copy;
    458		addr += copy;
    459		offset = 0;
    460	}
    461
    462	skb->len += len;
    463	skb->data_len += len;
    464	skb->truesize += ts;
    465
    466	refcount_add(ts, &xs->sk.sk_wmem_alloc);
    467
    468	return skb;
    469}
    470
    471static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
    472				     struct xdp_desc *desc)
    473{
    474	struct net_device *dev = xs->dev;
    475	struct sk_buff *skb;
    476
    477	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
    478		skb = xsk_build_skb_zerocopy(xs, desc);
    479		if (IS_ERR(skb))
    480			return skb;
    481	} else {
    482		u32 hr, tr, len;
    483		void *buffer;
    484		int err;
    485
    486		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
    487		tr = dev->needed_tailroom;
    488		len = desc->len;
    489
    490		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
    491		if (unlikely(!skb))
    492			return ERR_PTR(err);
    493
    494		skb_reserve(skb, hr);
    495		skb_put(skb, len);
    496
    497		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
    498		err = skb_store_bits(skb, 0, buffer, len);
    499		if (unlikely(err)) {
    500			kfree_skb(skb);
    501			return ERR_PTR(err);
    502		}
    503	}
    504
    505	skb->dev = dev;
    506	skb->priority = xs->sk.sk_priority;
    507	skb->mark = xs->sk.sk_mark;
    508	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
    509	skb->destructor = xsk_destruct_skb;
    510
    511	return skb;
    512}
    513
    514static int xsk_generic_xmit(struct sock *sk)
    515{
    516	struct xdp_sock *xs = xdp_sk(sk);
    517	u32 max_batch = TX_BATCH_SIZE;
    518	bool sent_frame = false;
    519	struct xdp_desc desc;
    520	struct sk_buff *skb;
    521	unsigned long flags;
    522	int err = 0;
    523
    524	mutex_lock(&xs->mutex);
    525
    526	/* Since we dropped the RCU read lock, the socket state might have changed. */
    527	if (unlikely(!xsk_is_bound(xs))) {
    528		err = -ENXIO;
    529		goto out;
    530	}
    531
    532	if (xs->queue_id >= xs->dev->real_num_tx_queues)
    533		goto out;
    534
    535	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
    536		if (max_batch-- == 0) {
    537			err = -EAGAIN;
    538			goto out;
    539		}
    540
    541		/* This is the backpressure mechanism for the Tx path.
    542		 * Reserve space in the completion queue and only proceed
    543		 * if there is space in it. This avoids having to implement
    544		 * any buffering in the Tx path.
    545		 */
    546		spin_lock_irqsave(&xs->pool->cq_lock, flags);
    547		if (xskq_prod_reserve(xs->pool->cq)) {
    548			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
    549			goto out;
    550		}
    551		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
    552
    553		skb = xsk_build_skb(xs, &desc);
    554		if (IS_ERR(skb)) {
    555			err = PTR_ERR(skb);
    556			spin_lock_irqsave(&xs->pool->cq_lock, flags);
    557			xskq_prod_cancel(xs->pool->cq);
    558			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
    559			goto out;
    560		}
    561
    562		err = __dev_direct_xmit(skb, xs->queue_id);
    563		if (err == NETDEV_TX_BUSY) {
    564			/* Tell user-space to retry the send */
    565			skb->destructor = sock_wfree;
    566			spin_lock_irqsave(&xs->pool->cq_lock, flags);
    567			xskq_prod_cancel(xs->pool->cq);
    568			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
    569			/* Free skb without triggering the perf drop trace */
    570			consume_skb(skb);
    571			err = -EAGAIN;
    572			goto out;
    573		}
    574
    575		xskq_cons_release(xs->tx);
    576		/* Ignore NET_XMIT_CN as packet might have been sent */
    577		if (err == NET_XMIT_DROP) {
    578			/* SKB completed but not sent */
    579			err = -EBUSY;
    580			goto out;
    581		}
    582
    583		sent_frame = true;
    584	}
    585
    586	xs->tx->queue_empty_descs++;
    587
    588out:
    589	if (sent_frame)
    590		if (xsk_tx_writeable(xs))
    591			sk->sk_write_space(sk);
    592
    593	mutex_unlock(&xs->mutex);
    594	return err;
    595}
    596
    597static int xsk_xmit(struct sock *sk)
    598{
    599	struct xdp_sock *xs = xdp_sk(sk);
    600	int ret;
    601
    602	if (unlikely(!(xs->dev->flags & IFF_UP)))
    603		return -ENETDOWN;
    604	if (unlikely(!xs->tx))
    605		return -ENOBUFS;
    606
    607	if (xs->zc)
    608		return xsk_wakeup(xs, XDP_WAKEUP_TX);
    609
    610	/* Drop the RCU lock since the SKB path might sleep. */
    611	rcu_read_unlock();
    612	ret = xsk_generic_xmit(sk);
    613	/* Reacquire RCU lock before going into common code. */
    614	rcu_read_lock();
    615
    616	return ret;
    617}
    618
    619static bool xsk_no_wakeup(struct sock *sk)
    620{
    621#ifdef CONFIG_NET_RX_BUSY_POLL
    622	/* Prefer busy-polling, skip the wakeup. */
    623	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
    624		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
    625#else
    626	return false;
    627#endif
    628}
    629
    630static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
    631{
    632	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
    633	struct sock *sk = sock->sk;
    634	struct xdp_sock *xs = xdp_sk(sk);
    635	struct xsk_buff_pool *pool;
    636
    637	if (unlikely(!xsk_is_bound(xs)))
    638		return -ENXIO;
    639	if (unlikely(need_wait))
    640		return -EOPNOTSUPP;
    641
    642	if (sk_can_busy_loop(sk))
    643		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
    644
    645	if (xs->zc && xsk_no_wakeup(sk))
    646		return 0;
    647
    648	pool = xs->pool;
    649	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
    650		return xsk_xmit(sk);
    651	return 0;
    652}
    653
    654static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
    655{
    656	int ret;
    657
    658	rcu_read_lock();
    659	ret = __xsk_sendmsg(sock, m, total_len);
    660	rcu_read_unlock();
    661
    662	return ret;
    663}
    664
    665static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
    666{
    667	bool need_wait = !(flags & MSG_DONTWAIT);
    668	struct sock *sk = sock->sk;
    669	struct xdp_sock *xs = xdp_sk(sk);
    670
    671	if (unlikely(!xsk_is_bound(xs)))
    672		return -ENXIO;
    673	if (unlikely(!(xs->dev->flags & IFF_UP)))
    674		return -ENETDOWN;
    675	if (unlikely(!xs->rx))
    676		return -ENOBUFS;
    677	if (unlikely(need_wait))
    678		return -EOPNOTSUPP;
    679
    680	if (sk_can_busy_loop(sk))
    681		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
    682
    683	if (xsk_no_wakeup(sk))
    684		return 0;
    685
    686	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
    687		return xsk_wakeup(xs, XDP_WAKEUP_RX);
    688	return 0;
    689}
    690
    691static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
    692{
    693	int ret;
    694
    695	rcu_read_lock();
    696	ret = __xsk_recvmsg(sock, m, len, flags);
    697	rcu_read_unlock();
    698
    699	return ret;
    700}
    701
    702static __poll_t xsk_poll(struct file *file, struct socket *sock,
    703			     struct poll_table_struct *wait)
    704{
    705	__poll_t mask = 0;
    706	struct sock *sk = sock->sk;
    707	struct xdp_sock *xs = xdp_sk(sk);
    708	struct xsk_buff_pool *pool;
    709
    710	sock_poll_wait(file, sock, wait);
    711
    712	rcu_read_lock();
    713	if (unlikely(!xsk_is_bound(xs))) {
    714		rcu_read_unlock();
    715		return mask;
    716	}
    717
    718	pool = xs->pool;
    719
    720	if (pool->cached_need_wakeup) {
    721		if (xs->zc)
    722			xsk_wakeup(xs, pool->cached_need_wakeup);
    723		else
    724			/* Poll needs to drive Tx also in copy mode */
    725			xsk_xmit(sk);
    726	}
    727
    728	if (xs->rx && !xskq_prod_is_empty(xs->rx))
    729		mask |= EPOLLIN | EPOLLRDNORM;
    730	if (xs->tx && xsk_tx_writeable(xs))
    731		mask |= EPOLLOUT | EPOLLWRNORM;
    732
    733	rcu_read_unlock();
    734	return mask;
    735}
    736
    737static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
    738			  bool umem_queue)
    739{
    740	struct xsk_queue *q;
    741
    742	if (entries == 0 || *queue || !is_power_of_2(entries))
    743		return -EINVAL;
    744
    745	q = xskq_create(entries, umem_queue);
    746	if (!q)
    747		return -ENOMEM;
    748
    749	/* Make sure queue is ready before it can be seen by others */
    750	smp_wmb();
    751	WRITE_ONCE(*queue, q);
    752	return 0;
    753}
    754
    755static void xsk_unbind_dev(struct xdp_sock *xs)
    756{
    757	struct net_device *dev = xs->dev;
    758
    759	if (xs->state != XSK_BOUND)
    760		return;
    761	WRITE_ONCE(xs->state, XSK_UNBOUND);
    762
    763	/* Wait for driver to stop using the xdp socket. */
    764	xp_del_xsk(xs->pool, xs);
    765	synchronize_net();
    766	dev_put(dev);
    767}
    768
    769static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
    770					      struct xdp_sock __rcu ***map_entry)
    771{
    772	struct xsk_map *map = NULL;
    773	struct xsk_map_node *node;
    774
    775	*map_entry = NULL;
    776
    777	spin_lock_bh(&xs->map_list_lock);
    778	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
    779					node);
    780	if (node) {
    781		bpf_map_inc(&node->map->map);
    782		map = node->map;
    783		*map_entry = node->map_entry;
    784	}
    785	spin_unlock_bh(&xs->map_list_lock);
    786	return map;
    787}
    788
    789static void xsk_delete_from_maps(struct xdp_sock *xs)
    790{
    791	/* This function removes the current XDP socket from all the
    792	 * maps it resides in. We need to take extra care here, due to
    793	 * the two locks involved. Each map has a lock synchronizing
    794	 * updates to the entries, and each socket has a lock that
    795	 * synchronizes access to the list of maps (map_list). For
    796	 * deadlock avoidance the locks need to be taken in the order
    797	 * "map lock"->"socket map list lock". We start off by
    798	 * accessing the socket map list, and take a reference to the
    799	 * map to guarantee existence between the
    800	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
    801	 * calls. Then we ask the map to remove the socket, which
    802	 * tries to remove the socket from the map. Note that there
    803	 * might be updates to the map between
    804	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
    805	 */
    806	struct xdp_sock __rcu **map_entry = NULL;
    807	struct xsk_map *map;
    808
    809	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
    810		xsk_map_try_sock_delete(map, xs, map_entry);
    811		bpf_map_put(&map->map);
    812	}
    813}
    814
    815static int xsk_release(struct socket *sock)
    816{
    817	struct sock *sk = sock->sk;
    818	struct xdp_sock *xs = xdp_sk(sk);
    819	struct net *net;
    820
    821	if (!sk)
    822		return 0;
    823
    824	net = sock_net(sk);
    825
    826	mutex_lock(&net->xdp.lock);
    827	sk_del_node_init_rcu(sk);
    828	mutex_unlock(&net->xdp.lock);
    829
    830	sock_prot_inuse_add(net, sk->sk_prot, -1);
    831
    832	xsk_delete_from_maps(xs);
    833	mutex_lock(&xs->mutex);
    834	xsk_unbind_dev(xs);
    835	mutex_unlock(&xs->mutex);
    836
    837	xskq_destroy(xs->rx);
    838	xskq_destroy(xs->tx);
    839	xskq_destroy(xs->fq_tmp);
    840	xskq_destroy(xs->cq_tmp);
    841
    842	sock_orphan(sk);
    843	sock->sk = NULL;
    844
    845	sk_refcnt_debug_release(sk);
    846	sock_put(sk);
    847
    848	return 0;
    849}
    850
    851static struct socket *xsk_lookup_xsk_from_fd(int fd)
    852{
    853	struct socket *sock;
    854	int err;
    855
    856	sock = sockfd_lookup(fd, &err);
    857	if (!sock)
    858		return ERR_PTR(-ENOTSOCK);
    859
    860	if (sock->sk->sk_family != PF_XDP) {
    861		sockfd_put(sock);
    862		return ERR_PTR(-ENOPROTOOPT);
    863	}
    864
    865	return sock;
    866}
    867
    868static bool xsk_validate_queues(struct xdp_sock *xs)
    869{
    870	return xs->fq_tmp && xs->cq_tmp;
    871}
    872
    873static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
    874{
    875	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
    876	struct sock *sk = sock->sk;
    877	struct xdp_sock *xs = xdp_sk(sk);
    878	struct net_device *dev;
    879	u32 flags, qid;
    880	int err = 0;
    881
    882	if (addr_len < sizeof(struct sockaddr_xdp))
    883		return -EINVAL;
    884	if (sxdp->sxdp_family != AF_XDP)
    885		return -EINVAL;
    886
    887	flags = sxdp->sxdp_flags;
    888	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
    889		      XDP_USE_NEED_WAKEUP))
    890		return -EINVAL;
    891
    892	rtnl_lock();
    893	mutex_lock(&xs->mutex);
    894	if (xs->state != XSK_READY) {
    895		err = -EBUSY;
    896		goto out_release;
    897	}
    898
    899	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
    900	if (!dev) {
    901		err = -ENODEV;
    902		goto out_release;
    903	}
    904
    905	if (!xs->rx && !xs->tx) {
    906		err = -EINVAL;
    907		goto out_unlock;
    908	}
    909
    910	qid = sxdp->sxdp_queue_id;
    911
    912	if (flags & XDP_SHARED_UMEM) {
    913		struct xdp_sock *umem_xs;
    914		struct socket *sock;
    915
    916		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
    917		    (flags & XDP_USE_NEED_WAKEUP)) {
    918			/* Cannot specify flags for shared sockets. */
    919			err = -EINVAL;
    920			goto out_unlock;
    921		}
    922
    923		if (xs->umem) {
    924			/* We already have our own. */
    925			err = -EINVAL;
    926			goto out_unlock;
    927		}
    928
    929		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
    930		if (IS_ERR(sock)) {
    931			err = PTR_ERR(sock);
    932			goto out_unlock;
    933		}
    934
    935		umem_xs = xdp_sk(sock->sk);
    936		if (!xsk_is_bound(umem_xs)) {
    937			err = -EBADF;
    938			sockfd_put(sock);
    939			goto out_unlock;
    940		}
    941
    942		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
    943			/* Share the umem with another socket on another qid
    944			 * and/or device.
    945			 */
    946			xs->pool = xp_create_and_assign_umem(xs,
    947							     umem_xs->umem);
    948			if (!xs->pool) {
    949				err = -ENOMEM;
    950				sockfd_put(sock);
    951				goto out_unlock;
    952			}
    953
    954			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
    955						   dev, qid);
    956			if (err) {
    957				xp_destroy(xs->pool);
    958				xs->pool = NULL;
    959				sockfd_put(sock);
    960				goto out_unlock;
    961			}
    962		} else {
    963			/* Share the buffer pool with the other socket. */
    964			if (xs->fq_tmp || xs->cq_tmp) {
    965				/* Do not allow setting your own fq or cq. */
    966				err = -EINVAL;
    967				sockfd_put(sock);
    968				goto out_unlock;
    969			}
    970
    971			xp_get_pool(umem_xs->pool);
    972			xs->pool = umem_xs->pool;
    973
    974			/* If the underlying shared umem was created without a Tx
    975			 * ring, allocate the Tx descs array that the Tx batching
    976			 * API utilizes.
    977			 */
    978			if (xs->tx && !xs->pool->tx_descs) {
    979				err = xp_alloc_tx_descs(xs->pool, xs);
    980				if (err) {
    981					xp_put_pool(xs->pool);
    982					sockfd_put(sock);
    983					goto out_unlock;
    984				}
    985			}
    986		}
    987
    988		xdp_get_umem(umem_xs->umem);
    989		WRITE_ONCE(xs->umem, umem_xs->umem);
    990		sockfd_put(sock);
    991	} else if (!xs->umem || !xsk_validate_queues(xs)) {
    992		err = -EINVAL;
    993		goto out_unlock;
    994	} else {
    995		/* This xsk has its own umem. */
    996		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
    997		if (!xs->pool) {
    998			err = -ENOMEM;
    999			goto out_unlock;
   1000		}
   1001
   1002		err = xp_assign_dev(xs->pool, dev, qid, flags);
   1003		if (err) {
   1004			xp_destroy(xs->pool);
   1005			xs->pool = NULL;
   1006			goto out_unlock;
   1007		}
   1008	}
   1009
   1010	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
   1011	xs->fq_tmp = NULL;
   1012	xs->cq_tmp = NULL;
   1013
   1014	xs->dev = dev;
   1015	xs->zc = xs->umem->zc;
   1016	xs->queue_id = qid;
   1017	xp_add_xsk(xs->pool, xs);
   1018
   1019out_unlock:
   1020	if (err) {
   1021		dev_put(dev);
   1022	} else {
   1023		/* Matches smp_rmb() in bind() for shared umem
   1024		 * sockets, and xsk_is_bound().
   1025		 */
   1026		smp_wmb();
   1027		WRITE_ONCE(xs->state, XSK_BOUND);
   1028	}
   1029out_release:
   1030	mutex_unlock(&xs->mutex);
   1031	rtnl_unlock();
   1032	return err;
   1033}
   1034
   1035struct xdp_umem_reg_v1 {
   1036	__u64 addr; /* Start of packet data area */
   1037	__u64 len; /* Length of packet data area */
   1038	__u32 chunk_size;
   1039	__u32 headroom;
   1040};
   1041
   1042static int xsk_setsockopt(struct socket *sock, int level, int optname,
   1043			  sockptr_t optval, unsigned int optlen)
   1044{
   1045	struct sock *sk = sock->sk;
   1046	struct xdp_sock *xs = xdp_sk(sk);
   1047	int err;
   1048
   1049	if (level != SOL_XDP)
   1050		return -ENOPROTOOPT;
   1051
   1052	switch (optname) {
   1053	case XDP_RX_RING:
   1054	case XDP_TX_RING:
   1055	{
   1056		struct xsk_queue **q;
   1057		int entries;
   1058
   1059		if (optlen < sizeof(entries))
   1060			return -EINVAL;
   1061		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
   1062			return -EFAULT;
   1063
   1064		mutex_lock(&xs->mutex);
   1065		if (xs->state != XSK_READY) {
   1066			mutex_unlock(&xs->mutex);
   1067			return -EBUSY;
   1068		}
   1069		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
   1070		err = xsk_init_queue(entries, q, false);
   1071		if (!err && optname == XDP_TX_RING)
   1072			/* Tx needs to be explicitly woken up the first time */
   1073			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
   1074		mutex_unlock(&xs->mutex);
   1075		return err;
   1076	}
   1077	case XDP_UMEM_REG:
   1078	{
   1079		size_t mr_size = sizeof(struct xdp_umem_reg);
   1080		struct xdp_umem_reg mr = {};
   1081		struct xdp_umem *umem;
   1082
   1083		if (optlen < sizeof(struct xdp_umem_reg_v1))
   1084			return -EINVAL;
   1085		else if (optlen < sizeof(mr))
   1086			mr_size = sizeof(struct xdp_umem_reg_v1);
   1087
   1088		if (copy_from_sockptr(&mr, optval, mr_size))
   1089			return -EFAULT;
   1090
   1091		mutex_lock(&xs->mutex);
   1092		if (xs->state != XSK_READY || xs->umem) {
   1093			mutex_unlock(&xs->mutex);
   1094			return -EBUSY;
   1095		}
   1096
   1097		umem = xdp_umem_create(&mr);
   1098		if (IS_ERR(umem)) {
   1099			mutex_unlock(&xs->mutex);
   1100			return PTR_ERR(umem);
   1101		}
   1102
   1103		/* Make sure umem is ready before it can be seen by others */
   1104		smp_wmb();
   1105		WRITE_ONCE(xs->umem, umem);
   1106		mutex_unlock(&xs->mutex);
   1107		return 0;
   1108	}
   1109	case XDP_UMEM_FILL_RING:
   1110	case XDP_UMEM_COMPLETION_RING:
   1111	{
   1112		struct xsk_queue **q;
   1113		int entries;
   1114
   1115		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
   1116			return -EFAULT;
   1117
   1118		mutex_lock(&xs->mutex);
   1119		if (xs->state != XSK_READY) {
   1120			mutex_unlock(&xs->mutex);
   1121			return -EBUSY;
   1122		}
   1123
   1124		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
   1125			&xs->cq_tmp;
   1126		err = xsk_init_queue(entries, q, true);
   1127		mutex_unlock(&xs->mutex);
   1128		return err;
   1129	}
   1130	default:
   1131		break;
   1132	}
   1133
   1134	return -ENOPROTOOPT;
   1135}
   1136
   1137static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
   1138{
   1139	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
   1140	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
   1141	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
   1142}
   1143
   1144static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
   1145{
   1146	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
   1147	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
   1148	ring->desc = offsetof(struct xdp_umem_ring, desc);
   1149}
   1150
   1151struct xdp_statistics_v1 {
   1152	__u64 rx_dropped;
   1153	__u64 rx_invalid_descs;
   1154	__u64 tx_invalid_descs;
   1155};
   1156
   1157static int xsk_getsockopt(struct socket *sock, int level, int optname,
   1158			  char __user *optval, int __user *optlen)
   1159{
   1160	struct sock *sk = sock->sk;
   1161	struct xdp_sock *xs = xdp_sk(sk);
   1162	int len;
   1163
   1164	if (level != SOL_XDP)
   1165		return -ENOPROTOOPT;
   1166
   1167	if (get_user(len, optlen))
   1168		return -EFAULT;
   1169	if (len < 0)
   1170		return -EINVAL;
   1171
   1172	switch (optname) {
   1173	case XDP_STATISTICS:
   1174	{
   1175		struct xdp_statistics stats = {};
   1176		bool extra_stats = true;
   1177		size_t stats_size;
   1178
   1179		if (len < sizeof(struct xdp_statistics_v1)) {
   1180			return -EINVAL;
   1181		} else if (len < sizeof(stats)) {
   1182			extra_stats = false;
   1183			stats_size = sizeof(struct xdp_statistics_v1);
   1184		} else {
   1185			stats_size = sizeof(stats);
   1186		}
   1187
   1188		mutex_lock(&xs->mutex);
   1189		stats.rx_dropped = xs->rx_dropped;
   1190		if (extra_stats) {
   1191			stats.rx_ring_full = xs->rx_queue_full;
   1192			stats.rx_fill_ring_empty_descs =
   1193				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
   1194			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
   1195		} else {
   1196			stats.rx_dropped += xs->rx_queue_full;
   1197		}
   1198		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
   1199		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
   1200		mutex_unlock(&xs->mutex);
   1201
   1202		if (copy_to_user(optval, &stats, stats_size))
   1203			return -EFAULT;
   1204		if (put_user(stats_size, optlen))
   1205			return -EFAULT;
   1206
   1207		return 0;
   1208	}
   1209	case XDP_MMAP_OFFSETS:
   1210	{
   1211		struct xdp_mmap_offsets off;
   1212		struct xdp_mmap_offsets_v1 off_v1;
   1213		bool flags_supported = true;
   1214		void *to_copy;
   1215
   1216		if (len < sizeof(off_v1))
   1217			return -EINVAL;
   1218		else if (len < sizeof(off))
   1219			flags_supported = false;
   1220
   1221		if (flags_supported) {
   1222			/* xdp_ring_offset is identical to xdp_ring_offset_v1
   1223			 * except for the flags field added to the end.
   1224			 */
   1225			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
   1226					       &off.rx);
   1227			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
   1228					       &off.tx);
   1229			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
   1230					       &off.fr);
   1231			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
   1232					       &off.cr);
   1233			off.rx.flags = offsetof(struct xdp_rxtx_ring,
   1234						ptrs.flags);
   1235			off.tx.flags = offsetof(struct xdp_rxtx_ring,
   1236						ptrs.flags);
   1237			off.fr.flags = offsetof(struct xdp_umem_ring,
   1238						ptrs.flags);
   1239			off.cr.flags = offsetof(struct xdp_umem_ring,
   1240						ptrs.flags);
   1241
   1242			len = sizeof(off);
   1243			to_copy = &off;
   1244		} else {
   1245			xsk_enter_rxtx_offsets(&off_v1.rx);
   1246			xsk_enter_rxtx_offsets(&off_v1.tx);
   1247			xsk_enter_umem_offsets(&off_v1.fr);
   1248			xsk_enter_umem_offsets(&off_v1.cr);
   1249
   1250			len = sizeof(off_v1);
   1251			to_copy = &off_v1;
   1252		}
   1253
   1254		if (copy_to_user(optval, to_copy, len))
   1255			return -EFAULT;
   1256		if (put_user(len, optlen))
   1257			return -EFAULT;
   1258
   1259		return 0;
   1260	}
   1261	case XDP_OPTIONS:
   1262	{
   1263		struct xdp_options opts = {};
   1264
   1265		if (len < sizeof(opts))
   1266			return -EINVAL;
   1267
   1268		mutex_lock(&xs->mutex);
   1269		if (xs->zc)
   1270			opts.flags |= XDP_OPTIONS_ZEROCOPY;
   1271		mutex_unlock(&xs->mutex);
   1272
   1273		len = sizeof(opts);
   1274		if (copy_to_user(optval, &opts, len))
   1275			return -EFAULT;
   1276		if (put_user(len, optlen))
   1277			return -EFAULT;
   1278
   1279		return 0;
   1280	}
   1281	default:
   1282		break;
   1283	}
   1284
   1285	return -EOPNOTSUPP;
   1286}
   1287
   1288static int xsk_mmap(struct file *file, struct socket *sock,
   1289		    struct vm_area_struct *vma)
   1290{
   1291	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
   1292	unsigned long size = vma->vm_end - vma->vm_start;
   1293	struct xdp_sock *xs = xdp_sk(sock->sk);
   1294	struct xsk_queue *q = NULL;
   1295	unsigned long pfn;
   1296	struct page *qpg;
   1297
   1298	if (READ_ONCE(xs->state) != XSK_READY)
   1299		return -EBUSY;
   1300
   1301	if (offset == XDP_PGOFF_RX_RING) {
   1302		q = READ_ONCE(xs->rx);
   1303	} else if (offset == XDP_PGOFF_TX_RING) {
   1304		q = READ_ONCE(xs->tx);
   1305	} else {
   1306		/* Matches the smp_wmb() in XDP_UMEM_REG */
   1307		smp_rmb();
   1308		if (offset == XDP_UMEM_PGOFF_FILL_RING)
   1309			q = READ_ONCE(xs->fq_tmp);
   1310		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
   1311			q = READ_ONCE(xs->cq_tmp);
   1312	}
   1313
   1314	if (!q)
   1315		return -EINVAL;
   1316
   1317	/* Matches the smp_wmb() in xsk_init_queue */
   1318	smp_rmb();
   1319	qpg = virt_to_head_page(q->ring);
   1320	if (size > page_size(qpg))
   1321		return -EINVAL;
   1322
   1323	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
   1324	return remap_pfn_range(vma, vma->vm_start, pfn,
   1325			       size, vma->vm_page_prot);
   1326}
   1327
   1328static int xsk_notifier(struct notifier_block *this,
   1329			unsigned long msg, void *ptr)
   1330{
   1331	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   1332	struct net *net = dev_net(dev);
   1333	struct sock *sk;
   1334
   1335	switch (msg) {
   1336	case NETDEV_UNREGISTER:
   1337		mutex_lock(&net->xdp.lock);
   1338		sk_for_each(sk, &net->xdp.list) {
   1339			struct xdp_sock *xs = xdp_sk(sk);
   1340
   1341			mutex_lock(&xs->mutex);
   1342			if (xs->dev == dev) {
   1343				sk->sk_err = ENETDOWN;
   1344				if (!sock_flag(sk, SOCK_DEAD))
   1345					sk_error_report(sk);
   1346
   1347				xsk_unbind_dev(xs);
   1348
   1349				/* Clear device references. */
   1350				xp_clear_dev(xs->pool);
   1351			}
   1352			mutex_unlock(&xs->mutex);
   1353		}
   1354		mutex_unlock(&net->xdp.lock);
   1355		break;
   1356	}
   1357	return NOTIFY_DONE;
   1358}
   1359
   1360static struct proto xsk_proto = {
   1361	.name =		"XDP",
   1362	.owner =	THIS_MODULE,
   1363	.obj_size =	sizeof(struct xdp_sock),
   1364};
   1365
   1366static const struct proto_ops xsk_proto_ops = {
   1367	.family		= PF_XDP,
   1368	.owner		= THIS_MODULE,
   1369	.release	= xsk_release,
   1370	.bind		= xsk_bind,
   1371	.connect	= sock_no_connect,
   1372	.socketpair	= sock_no_socketpair,
   1373	.accept		= sock_no_accept,
   1374	.getname	= sock_no_getname,
   1375	.poll		= xsk_poll,
   1376	.ioctl		= sock_no_ioctl,
   1377	.listen		= sock_no_listen,
   1378	.shutdown	= sock_no_shutdown,
   1379	.setsockopt	= xsk_setsockopt,
   1380	.getsockopt	= xsk_getsockopt,
   1381	.sendmsg	= xsk_sendmsg,
   1382	.recvmsg	= xsk_recvmsg,
   1383	.mmap		= xsk_mmap,
   1384	.sendpage	= sock_no_sendpage,
   1385};
   1386
   1387static void xsk_destruct(struct sock *sk)
   1388{
   1389	struct xdp_sock *xs = xdp_sk(sk);
   1390
   1391	if (!sock_flag(sk, SOCK_DEAD))
   1392		return;
   1393
   1394	if (!xp_put_pool(xs->pool))
   1395		xdp_put_umem(xs->umem, !xs->pool);
   1396
   1397	sk_refcnt_debug_dec(sk);
   1398}
   1399
   1400static int xsk_create(struct net *net, struct socket *sock, int protocol,
   1401		      int kern)
   1402{
   1403	struct xdp_sock *xs;
   1404	struct sock *sk;
   1405
   1406	if (!ns_capable(net->user_ns, CAP_NET_RAW))
   1407		return -EPERM;
   1408	if (sock->type != SOCK_RAW)
   1409		return -ESOCKTNOSUPPORT;
   1410
   1411	if (protocol)
   1412		return -EPROTONOSUPPORT;
   1413
   1414	sock->state = SS_UNCONNECTED;
   1415
   1416	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
   1417	if (!sk)
   1418		return -ENOBUFS;
   1419
   1420	sock->ops = &xsk_proto_ops;
   1421
   1422	sock_init_data(sock, sk);
   1423
   1424	sk->sk_family = PF_XDP;
   1425
   1426	sk->sk_destruct = xsk_destruct;
   1427	sk_refcnt_debug_inc(sk);
   1428
   1429	sock_set_flag(sk, SOCK_RCU_FREE);
   1430
   1431	xs = xdp_sk(sk);
   1432	xs->state = XSK_READY;
   1433	mutex_init(&xs->mutex);
   1434	spin_lock_init(&xs->rx_lock);
   1435
   1436	INIT_LIST_HEAD(&xs->map_list);
   1437	spin_lock_init(&xs->map_list_lock);
   1438
   1439	mutex_lock(&net->xdp.lock);
   1440	sk_add_node_rcu(sk, &net->xdp.list);
   1441	mutex_unlock(&net->xdp.lock);
   1442
   1443	sock_prot_inuse_add(net, &xsk_proto, 1);
   1444
   1445	return 0;
   1446}
   1447
   1448static const struct net_proto_family xsk_family_ops = {
   1449	.family = PF_XDP,
   1450	.create = xsk_create,
   1451	.owner	= THIS_MODULE,
   1452};
   1453
   1454static struct notifier_block xsk_netdev_notifier = {
   1455	.notifier_call	= xsk_notifier,
   1456};
   1457
   1458static int __net_init xsk_net_init(struct net *net)
   1459{
   1460	mutex_init(&net->xdp.lock);
   1461	INIT_HLIST_HEAD(&net->xdp.list);
   1462	return 0;
   1463}
   1464
   1465static void __net_exit xsk_net_exit(struct net *net)
   1466{
   1467	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
   1468}
   1469
   1470static struct pernet_operations xsk_net_ops = {
   1471	.init = xsk_net_init,
   1472	.exit = xsk_net_exit,
   1473};
   1474
   1475static int __init xsk_init(void)
   1476{
   1477	int err, cpu;
   1478
   1479	err = proto_register(&xsk_proto, 0 /* no slab */);
   1480	if (err)
   1481		goto out;
   1482
   1483	err = sock_register(&xsk_family_ops);
   1484	if (err)
   1485		goto out_proto;
   1486
   1487	err = register_pernet_subsys(&xsk_net_ops);
   1488	if (err)
   1489		goto out_sk;
   1490
   1491	err = register_netdevice_notifier(&xsk_netdev_notifier);
   1492	if (err)
   1493		goto out_pernet;
   1494
   1495	for_each_possible_cpu(cpu)
   1496		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
   1497	return 0;
   1498
   1499out_pernet:
   1500	unregister_pernet_subsys(&xsk_net_ops);
   1501out_sk:
   1502	sock_unregister(PF_XDP);
   1503out_proto:
   1504	proto_unregister(&xsk_proto);
   1505out:
   1506	return err;
   1507}
   1508
   1509fs_initcall(xsk_init);
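
The file above implements the AF_XDP control path: UMEM registration and the ring setsockopts (xsk_setsockopt), the ring mmaps (xsk_mmap), bind (xsk_bind), and the need_wakeup protocol (xsk_set_rx_need_wakeup / xsk_set_tx_need_wakeup). The following is a minimal, illustrative userspace sketch of how that control path is typically driven; it is not part of xsk.c. The interface name "eth0", NUM_FRAMES, FRAME_SIZE and RING_SIZE are arbitrary assumptions, and all error handling is omitted.

#include <linux/if_xdp.h>
#include <net/if.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <unistd.h>

#define NUM_FRAMES 4096
#define FRAME_SIZE 2048	/* chunk size; must be a power of two in aligned mode */
#define RING_SIZE  2048	/* all ring sizes must be powers of two */

int main(void)
{
	struct xdp_mmap_offsets off;
	struct sockaddr_xdp sxdp = { .sxdp_family = AF_XDP };
	socklen_t optlen = sizeof(off);
	int ring_size = RING_SIZE;
	void *umem_area, *fill_map;
	__u32 *fq_flags;
	int fd;

	/* xsk_create() only accepts SOCK_RAW and protocol 0. */
	fd = socket(AF_XDP, SOCK_RAW, 0);

	/* Register the packet buffer area (XDP_UMEM_REG in xsk_setsockopt()). */
	posix_memalign(&umem_area, getpagesize(), NUM_FRAMES * FRAME_SIZE);
	struct xdp_umem_reg mr = {
		.addr = (__u64)(unsigned long)umem_area,
		.len = NUM_FRAMES * FRAME_SIZE,
		.chunk_size = FRAME_SIZE,
		.headroom = 0,
	};
	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

	/* Create the four rings; xsk_init_queue() rejects sizes that are not
	 * powers of two.
	 */
	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_TX_RING, &ring_size, sizeof(ring_size));

	/* Query the ring layout and map the fill ring (served by xsk_mmap()). */
	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
	fill_map = mmap(NULL, off.fr.desc + RING_SIZE * sizeof(__u64),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd,
			XDP_UMEM_PGOFF_FILL_RING);

	/* Bind to a device/queue: this is where xsk_bind() creates the buffer
	 * pool and transitions the socket to XSK_BOUND.
	 */
	sxdp.sxdp_ifindex = if_nametoindex("eth0");
	sxdp.sxdp_queue_id = 0;
	sxdp.sxdp_flags = XDP_USE_NEED_WAKEUP;
	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	/* With XDP_USE_NEED_WAKEUP, a wakeup syscall is only needed when the
	 * kernel sets XDP_RING_NEED_WAKEUP in the ring flags, which is what
	 * xsk_set_rx_need_wakeup()/xsk_set_tx_need_wakeup() above do.
	 */
	fq_flags = (__u32 *)((char *)fill_map + off.fr.flags);
	if (*fq_flags & XDP_RING_NEED_WAKEUP)
		recvfrom(fd, NULL, 0, MSG_DONTWAIT, NULL, NULL);

	return 0;
}

After bind(), the fill, completion, Rx and Tx rings are driven through the producer/consumer indices located at the offsets returned by XDP_MMAP_OFFSETS; with XDP_USE_NEED_WAKEUP set, recvfrom()/sendto() wakeups are only issued when the kernel raises XDP_RING_NEED_WAKEUP, matching the checks in xsk_recvmsg()/xsk_sendmsg() above.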