cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

virtio_net.c (102716B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/* A network driver using virtio.
      3 *
      4 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
      5 */
      6//#define DEBUG
      7#include <linux/netdevice.h>
      8#include <linux/etherdevice.h>
      9#include <linux/ethtool.h>
     10#include <linux/module.h>
     11#include <linux/virtio.h>
     12#include <linux/virtio_net.h>
     13#include <linux/bpf.h>
     14#include <linux/bpf_trace.h>
     15#include <linux/scatterlist.h>
     16#include <linux/if_vlan.h>
     17#include <linux/slab.h>
     18#include <linux/cpu.h>
     19#include <linux/average.h>
     20#include <linux/filter.h>
     21#include <linux/kernel.h>
     22#include <net/route.h>
     23#include <net/xdp.h>
     24#include <net/net_failover.h>
     25
     26static int napi_weight = NAPI_POLL_WEIGHT;
     27module_param(napi_weight, int, 0444);
     28
     29static bool csum = true, gso = true, napi_tx = true;
     30module_param(csum, bool, 0444);
     31module_param(gso, bool, 0444);
     32module_param(napi_tx, bool, 0644);
     33
     34/* FIXME: MTU in config. */
     35#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
     36#define GOOD_COPY_LEN	128
     37
     38#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
     39
     40/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
     41#define VIRTIO_XDP_HEADROOM 256
     42
     43/* Separating two types of XDP xmit */
     44#define VIRTIO_XDP_TX		BIT(0)
     45#define VIRTIO_XDP_REDIR	BIT(1)
     46
     47#define VIRTIO_XDP_FLAG	BIT(0)
     48
     49/* RX packet size EWMA. The average packet size is used to determine the packet
     50 * buffer size when refilling RX rings. As the entire RX ring may be refilled
     51 * at once, the weight is chosen so that the EWMA will be insensitive to short-
     52 * term, transient changes in packet size.
     53 */
     54DECLARE_EWMA(pkt_len, 0, 64)
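        /* Editor's note: per the DECLARE_EWMA() semantics in linux/average.h, the
         * (precision, weight_rcp) pair (0, 64) above roughly means each sample moves
         * the average by 1/64 of the difference: avg_new ~= avg_old + (pkt_len - avg_old) / 64.
         * A short burst of unusually small or large packets therefore barely shifts
         * the buffer size estimate used when refilling the RX ring.
         */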
     55
     56#define VIRTNET_DRIVER_VERSION "1.0.0"
     57
     58static const unsigned long guest_offloads[] = {
     59	VIRTIO_NET_F_GUEST_TSO4,
     60	VIRTIO_NET_F_GUEST_TSO6,
     61	VIRTIO_NET_F_GUEST_ECN,
     62	VIRTIO_NET_F_GUEST_UFO,
     63	VIRTIO_NET_F_GUEST_CSUM
     64};
     65
     66#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
     67				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
     68				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
     69				(1ULL << VIRTIO_NET_F_GUEST_UFO))
     70
     71struct virtnet_stat_desc {
     72	char desc[ETH_GSTRING_LEN];
     73	size_t offset;
     74};
     75
     76struct virtnet_sq_stats {
     77	struct u64_stats_sync syncp;
     78	u64 packets;
     79	u64 bytes;
     80	u64 xdp_tx;
     81	u64 xdp_tx_drops;
     82	u64 kicks;
     83	u64 tx_timeouts;
     84};
     85
     86struct virtnet_rq_stats {
     87	struct u64_stats_sync syncp;
     88	u64 packets;
     89	u64 bytes;
     90	u64 drops;
     91	u64 xdp_packets;
     92	u64 xdp_tx;
     93	u64 xdp_redirects;
     94	u64 xdp_drops;
     95	u64 kicks;
     96};
     97
     98#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
     99#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
    100
    101static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
    102	{ "packets",		VIRTNET_SQ_STAT(packets) },
    103	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
    104	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
    105	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
    106	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
    107	{ "tx_timeouts",	VIRTNET_SQ_STAT(tx_timeouts) },
    108};
    109
    110static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
    111	{ "packets",		VIRTNET_RQ_STAT(packets) },
    112	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
    113	{ "drops",		VIRTNET_RQ_STAT(drops) },
    114	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
    115	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
    116	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
    117	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
    118	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
    119};
    120
    121#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
    122#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
    123
    124/* Internal representation of a send virtqueue */
    125struct send_queue {
     126	/* Virtqueue associated with this send queue */
    127	struct virtqueue *vq;
    128
    129	/* TX: fragments + linear part + virtio header */
    130	struct scatterlist sg[MAX_SKB_FRAGS + 2];
    131
    132	/* Name of the send queue: output.$index */
    133	char name[40];
    134
    135	struct virtnet_sq_stats stats;
    136
    137	struct napi_struct napi;
    138};
    139
    140/* Internal representation of a receive virtqueue */
    141struct receive_queue {
    142	/* Virtqueue associated with this receive_queue */
    143	struct virtqueue *vq;
    144
    145	struct napi_struct napi;
    146
    147	struct bpf_prog __rcu *xdp_prog;
    148
    149	struct virtnet_rq_stats stats;
    150
    151	/* Chain pages by the private ptr. */
    152	struct page *pages;
    153
    154	/* Average packet length for mergeable receive buffers. */
    155	struct ewma_pkt_len mrg_avg_pkt_len;
    156
    157	/* Page frag for packet buffer allocation. */
    158	struct page_frag alloc_frag;
    159
    160	/* RX: fragments + linear part + virtio header */
    161	struct scatterlist sg[MAX_SKB_FRAGS + 2];
    162
    163	/* Min single buffer size for mergeable buffers case. */
    164	unsigned int min_buf_len;
    165
    166	/* Name of this receive queue: input.$index */
    167	char name[40];
    168
    169	struct xdp_rxq_info xdp_rxq;
    170};
    171
     172/* This structure can contain an RSS message with maximum settings for the indirection table and key size.
     173 * Note that the default structure that describes the RSS configuration, virtio_net_rss_config,
     174 * contains the same info but can't hold the table values.
     175 * In any case, the structure is passed to the virtio hw through sg_buf, split into parts,
     176 * because table sizes may differ according to the device configuration.
     177 */
    178#define VIRTIO_NET_RSS_MAX_KEY_SIZE     40
    179#define VIRTIO_NET_RSS_MAX_TABLE_LEN    128
    180struct virtio_net_ctrl_rss {
    181	u32 hash_types;
    182	u16 indirection_table_mask;
    183	u16 unclassified_queue;
    184	u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN];
    185	u16 max_tx_vq;
    186	u8 hash_key_length;
    187	u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
    188};
    189
    190/* Control VQ buffers: protected by the rtnl lock */
    191struct control_buf {
    192	struct virtio_net_ctrl_hdr hdr;
    193	virtio_net_ctrl_ack status;
    194	struct virtio_net_ctrl_mq mq;
    195	u8 promisc;
    196	u8 allmulti;
    197	__virtio16 vid;
    198	__virtio64 offloads;
    199	struct virtio_net_ctrl_rss rss;
    200};
    201
    202struct virtnet_info {
    203	struct virtio_device *vdev;
    204	struct virtqueue *cvq;
    205	struct net_device *dev;
    206	struct send_queue *sq;
    207	struct receive_queue *rq;
    208	unsigned int status;
    209
    210	/* Max # of queue pairs supported by the device */
    211	u16 max_queue_pairs;
    212
    213	/* # of queue pairs currently used by the driver */
    214	u16 curr_queue_pairs;
    215
    216	/* # of XDP queue pairs currently used by the driver */
    217	u16 xdp_queue_pairs;
    218
     219	/* xdp_queue_pairs may be 0 even when XDP is loaded, so track XDP state separately. */
    220	bool xdp_enabled;
    221
    222	/* I like... big packets and I cannot lie! */
    223	bool big_packets;
    224
    225	/* Host will merge rx buffers for big packets (shake it! shake it!) */
    226	bool mergeable_rx_bufs;
    227
    228	/* Host supports rss and/or hash report */
    229	bool has_rss;
    230	bool has_rss_hash_report;
    231	u8 rss_key_size;
    232	u16 rss_indir_table_size;
    233	u32 rss_hash_types_supported;
    234	u32 rss_hash_types_saved;
    235
    236	/* Has control virtqueue */
    237	bool has_cvq;
    238
    239	/* Host can handle any s/g split between our header and packet data */
    240	bool any_header_sg;
    241
    242	/* Packet virtio header size */
    243	u8 hdr_len;
    244
    245	/* Work struct for refilling if we run low on memory. */
    246	struct delayed_work refill;
    247
    248	/* Work struct for config space updates */
    249	struct work_struct config_work;
    250
    251	/* Does the affinity hint is set for virtqueues? */
    252	bool affinity_hint_set;
    253
    254	/* CPU hotplug instances for online & dead */
    255	struct hlist_node node;
    256	struct hlist_node node_dead;
    257
    258	struct control_buf *ctrl;
    259
    260	/* Ethtool settings */
    261	u8 duplex;
    262	u32 speed;
    263
    264	unsigned long guest_offloads;
    265	unsigned long guest_offloads_capable;
    266
    267	/* failover when STANDBY feature enabled */
    268	struct failover *failover;
    269};
    270
    271struct padded_vnet_hdr {
    272	struct virtio_net_hdr_v1_hash hdr;
    273	/*
    274	 * hdr is in a separate sg buffer, and data sg buffer shares same page
    275	 * with this header sg. This padding makes next sg 16 byte aligned
    276	 * after the header.
    277	 */
    278	char padding[12];
    279};
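        /* Editor's note: assuming the uapi layout of struct virtio_net_hdr_v1_hash
         * (12-byte virtio_net_hdr_v1 + 4-byte hash_value + 2-byte hash_report +
         * 2-byte padding = 20 bytes), the 12 bytes of padding above bring the total
         * to 32 bytes, so the data sg entry that shares the page with this header
         * starts 16-byte aligned, as the comment inside the struct requires.
         */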
    280
    281static bool is_xdp_frame(void *ptr)
    282{
    283	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
    284}
    285
    286static void *xdp_to_ptr(struct xdp_frame *ptr)
    287{
    288	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
    289}
    290
    291static struct xdp_frame *ptr_to_xdp(void *ptr)
    292{
    293	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
    294}
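        /* Editor's note: the helpers above tag outstanding tx buffers by setting
         * VIRTIO_XDP_FLAG (bit 0) in the token handed to the virtqueue, so the
         * completion path can tell an xdp_frame apart from an sk_buff. This relies
         * on both pointer types being at least 2-byte aligned, which leaves bit 0
         * free for the tag.
         */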
    295
    296/* Converting between virtqueue no. and kernel tx/rx queue no.
    297 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
    298 */
    299static int vq2txq(struct virtqueue *vq)
    300{
    301	return (vq->index - 1) / 2;
    302}
    303
    304static int txq2vq(int txq)
    305{
    306	return txq * 2 + 1;
    307}
    308
    309static int vq2rxq(struct virtqueue *vq)
    310{
    311	return vq->index / 2;
    312}
    313
    314static int rxq2vq(int rxq)
    315{
    316	return rxq * 2;
    317}
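        /* Editor's note: with the rx/tx interleaving described above, e.g. rx queue 3
         * maps to virtqueue 6 (rxq2vq(3) == 6) and tx queue 3 to virtqueue 7
         * (txq2vq(3) == 7); the control virtqueue, if present, comes after the last
         * tx virtqueue.
         */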
    318
    319static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
    320{
    321	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
    322}
    323
    324/*
     325 * page->private is used to chain pages for big packets; put the whole
     326 * most recently used list at the front for reuse
    327 */
    328static void give_pages(struct receive_queue *rq, struct page *page)
    329{
    330	struct page *end;
    331
    332	/* Find end of list, sew whole thing into vi->rq.pages. */
    333	for (end = page; end->private; end = (struct page *)end->private);
    334	end->private = (unsigned long)rq->pages;
    335	rq->pages = page;
    336}
    337
    338static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
    339{
    340	struct page *p = rq->pages;
    341
    342	if (p) {
    343		rq->pages = (struct page *)p->private;
    344		/* clear private here, it is used to chain pages */
    345		p->private = 0;
    346	} else
    347		p = alloc_page(gfp_mask);
    348	return p;
    349}
    350
    351static void virtqueue_napi_schedule(struct napi_struct *napi,
    352				    struct virtqueue *vq)
    353{
    354	if (napi_schedule_prep(napi)) {
    355		virtqueue_disable_cb(vq);
    356		__napi_schedule(napi);
    357	}
    358}
    359
    360static void virtqueue_napi_complete(struct napi_struct *napi,
    361				    struct virtqueue *vq, int processed)
    362{
    363	int opaque;
    364
    365	opaque = virtqueue_enable_cb_prepare(vq);
    366	if (napi_complete_done(napi, processed)) {
    367		if (unlikely(virtqueue_poll(vq, opaque)))
    368			virtqueue_napi_schedule(napi, vq);
    369	} else {
    370		virtqueue_disable_cb(vq);
    371	}
    372}
    373
    374static void skb_xmit_done(struct virtqueue *vq)
    375{
    376	struct virtnet_info *vi = vq->vdev->priv;
    377	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
    378
    379	/* Suppress further interrupts. */
    380	virtqueue_disable_cb(vq);
    381
    382	if (napi->weight)
    383		virtqueue_napi_schedule(napi, vq);
    384	else
    385		/* We were probably waiting for more output buffers. */
    386		netif_wake_subqueue(vi->dev, vq2txq(vq));
    387}
    388
    389#define MRG_CTX_HEADER_SHIFT 22
    390static void *mergeable_len_to_ctx(unsigned int truesize,
    391				  unsigned int headroom)
    392{
    393	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
    394}
    395
    396static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
    397{
    398	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
    399}
    400
    401static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
    402{
    403	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
    404}
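        /* Editor's note (worked example): with MRG_CTX_HEADER_SHIFT == 22, a buffer
         * with truesize 1536 and headroom 256 is encoded by mergeable_len_to_ctx() as
         * (256 << 22) | 1536, and the two accessors above recover 256 and 1536 again.
         * This packing assumes truesize always fits in the low 22 bits.
         */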
    405
    406/* Called from bottom half context */
    407static struct sk_buff *page_to_skb(struct virtnet_info *vi,
    408				   struct receive_queue *rq,
    409				   struct page *page, unsigned int offset,
    410				   unsigned int len, unsigned int truesize,
    411				   bool hdr_valid, unsigned int metasize,
    412				   unsigned int headroom)
    413{
    414	struct sk_buff *skb;
    415	struct virtio_net_hdr_mrg_rxbuf *hdr;
    416	unsigned int copy, hdr_len, hdr_padded_len;
    417	struct page *page_to_free = NULL;
    418	int tailroom, shinfo_size;
    419	char *p, *hdr_p, *buf;
    420
    421	p = page_address(page) + offset;
    422	hdr_p = p;
    423
    424	hdr_len = vi->hdr_len;
    425	if (vi->mergeable_rx_bufs)
    426		hdr_padded_len = hdr_len;
    427	else
    428		hdr_padded_len = sizeof(struct padded_vnet_hdr);
    429
    430	/* If headroom is not 0, there is an offset between the beginning of the
    431	 * data and the allocated space, otherwise the data and the allocated
    432	 * space are aligned.
    433	 *
    434	 * Buffers with headroom use PAGE_SIZE as alloc size, see
    435	 * add_recvbuf_mergeable() + get_mergeable_buf_len()
    436	 */
    437	truesize = headroom ? PAGE_SIZE : truesize;
    438	tailroom = truesize - headroom;
    439	buf = p - headroom;
    440
    441	len -= hdr_len;
    442	offset += hdr_padded_len;
    443	p += hdr_padded_len;
    444	tailroom -= hdr_padded_len + len;
    445
    446	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    447
    448	/* copy small packet so we can reuse these pages */
    449	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
    450		skb = build_skb(buf, truesize);
    451		if (unlikely(!skb))
    452			return NULL;
    453
    454		skb_reserve(skb, p - buf);
    455		skb_put(skb, len);
    456
    457		page = (struct page *)page->private;
    458		if (page)
    459			give_pages(rq, page);
    460		goto ok;
    461	}
    462
    463	/* copy small packet so we can reuse these pages for small data */
    464	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
    465	if (unlikely(!skb))
    466		return NULL;
    467
     468	/* Copy the whole frame if it fits skb->head, otherwise
    469	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
    470	 */
    471	if (len <= skb_tailroom(skb))
    472		copy = len;
    473	else
    474		copy = ETH_HLEN + metasize;
    475	skb_put_data(skb, p, copy);
    476
    477	len -= copy;
    478	offset += copy;
    479
    480	if (vi->mergeable_rx_bufs) {
    481		if (len)
    482			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
    483		else
    484			page_to_free = page;
    485		goto ok;
    486	}
    487
    488	/*
    489	 * Verify that we can indeed put this data into a skb.
    490	 * This is here to handle cases when the device erroneously
    491	 * tries to receive more than is possible. This is usually
    492	 * the case of a broken device.
    493	 */
    494	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
    495		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
    496		dev_kfree_skb(skb);
    497		return NULL;
    498	}
    499	BUG_ON(offset >= PAGE_SIZE);
    500	while (len) {
    501		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
    502		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
    503				frag_size, truesize);
    504		len -= frag_size;
    505		page = (struct page *)page->private;
    506		offset = 0;
    507	}
    508
    509	if (page)
    510		give_pages(rq, page);
    511
    512ok:
    513	/* hdr_valid means no XDP, so we can copy the vnet header */
    514	if (hdr_valid) {
    515		hdr = skb_vnet_hdr(skb);
    516		memcpy(hdr, hdr_p, hdr_len);
    517	}
    518	if (page_to_free)
    519		put_page(page_to_free);
    520
    521	if (metasize) {
    522		__skb_pull(skb, metasize);
    523		skb_metadata_set(skb, metasize);
    524	}
    525
    526	return skb;
    527}
    528
    529static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
    530				   struct send_queue *sq,
    531				   struct xdp_frame *xdpf)
    532{
    533	struct virtio_net_hdr_mrg_rxbuf *hdr;
    534	int err;
    535
    536	if (unlikely(xdpf->headroom < vi->hdr_len))
    537		return -EOVERFLOW;
    538
    539	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
    540	xdpf->data -= vi->hdr_len;
    541	/* Zero header and leave csum up to XDP layers */
    542	hdr = xdpf->data;
    543	memset(hdr, 0, vi->hdr_len);
    544	xdpf->len   += vi->hdr_len;
    545
    546	sg_init_one(sq->sg, xdpf->data, xdpf->len);
    547
    548	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
    549				   GFP_ATOMIC);
    550	if (unlikely(err))
    551		return -ENOSPC; /* Caller handle free/refcnt */
    552
    553	return 0;
    554}
    555
    556/* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
    557 * the current cpu, so it does not need to be locked.
    558 *
     559 * Here we use a macro instead of inline functions because we have to deal with
     560 * three issues at the same time: 1. the choice of sq, 2. deciding whether to
     561 * lock/unlock the txq, and 3. keeping sparse happy. It is difficult for two inline
     562 * functions to solve all three problems at the same time.
    563 */
    564#define virtnet_xdp_get_sq(vi) ({                                       \
    565	int cpu = smp_processor_id();                                   \
    566	struct netdev_queue *txq;                                       \
    567	typeof(vi) v = (vi);                                            \
    568	unsigned int qp;                                                \
    569									\
    570	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
    571		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
    572		qp += cpu;                                              \
    573		txq = netdev_get_tx_queue(v->dev, qp);                  \
    574		__netif_tx_acquire(txq);                                \
    575	} else {                                                        \
    576		qp = cpu % v->curr_queue_pairs;                         \
    577		txq = netdev_get_tx_queue(v->dev, qp);                  \
    578		__netif_tx_lock(txq, cpu);                              \
    579	}                                                               \
    580	v->sq + qp;                                                     \
    581})
    582
    583#define virtnet_xdp_put_sq(vi, q) {                                     \
    584	struct netdev_queue *txq;                                       \
    585	typeof(vi) v = (vi);                                            \
    586									\
    587	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
    588	if (v->curr_queue_pairs > nr_cpu_ids)                           \
    589		__netif_tx_release(txq);                                \
    590	else                                                            \
    591		__netif_tx_unlock(txq);                                 \
    592}
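        /* Editor's note: taken together, the two macros above pick the tx queue used
         * for XDP transmission. When there are more queue pairs than CPUs, each CPU
         * owns a dedicated XDP tx queue (curr_queue_pairs - xdp_queue_pairs + cpu),
         * which only needs __netif_tx_acquire() to keep sparse happy; otherwise a
         * queue is shared (cpu % curr_queue_pairs) and must be taken with
         * __netif_tx_lock().
         */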
    593
    594static int virtnet_xdp_xmit(struct net_device *dev,
    595			    int n, struct xdp_frame **frames, u32 flags)
    596{
    597	struct virtnet_info *vi = netdev_priv(dev);
    598	struct receive_queue *rq = vi->rq;
    599	struct bpf_prog *xdp_prog;
    600	struct send_queue *sq;
    601	unsigned int len;
    602	int packets = 0;
    603	int bytes = 0;
    604	int nxmit = 0;
    605	int kicks = 0;
    606	void *ptr;
    607	int ret;
    608	int i;
    609
    610	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
     611	 * indicates XDP resources have been successfully allocated.
    612	 */
    613	xdp_prog = rcu_access_pointer(rq->xdp_prog);
    614	if (!xdp_prog)
    615		return -ENXIO;
    616
    617	sq = virtnet_xdp_get_sq(vi);
    618
    619	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
    620		ret = -EINVAL;
    621		goto out;
    622	}
    623
    624	/* Free up any pending old buffers before queueing new ones. */
    625	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
    626		if (likely(is_xdp_frame(ptr))) {
    627			struct xdp_frame *frame = ptr_to_xdp(ptr);
    628
    629			bytes += frame->len;
    630			xdp_return_frame(frame);
    631		} else {
    632			struct sk_buff *skb = ptr;
    633
    634			bytes += skb->len;
    635			napi_consume_skb(skb, false);
    636		}
    637		packets++;
    638	}
    639
    640	for (i = 0; i < n; i++) {
    641		struct xdp_frame *xdpf = frames[i];
    642
    643		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
    644			break;
    645		nxmit++;
    646	}
    647	ret = nxmit;
    648
    649	if (flags & XDP_XMIT_FLUSH) {
    650		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
    651			kicks = 1;
    652	}
    653out:
    654	u64_stats_update_begin(&sq->stats.syncp);
    655	sq->stats.bytes += bytes;
    656	sq->stats.packets += packets;
    657	sq->stats.xdp_tx += n;
    658	sq->stats.xdp_tx_drops += n - nxmit;
    659	sq->stats.kicks += kicks;
    660	u64_stats_update_end(&sq->stats.syncp);
    661
    662	virtnet_xdp_put_sq(vi, sq);
    663	return ret;
    664}
    665
    666static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
    667{
    668	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
    669}
    670
    671/* We copy the packet for XDP in the following cases:
    672 *
    673 * 1) Packet is scattered across multiple rx buffers.
    674 * 2) Headroom space is insufficient.
    675 *
    676 * This is inefficient but it's a temporary condition that
     677 * we hit right after XDP is enabled and until the queue is refilled
    678 * with large buffers with sufficient headroom - so it should affect
    679 * at most queue size packets.
    680 * Afterwards, the conditions to enable
    681 * XDP should preclude the underlying device from sending packets
    682 * across multiple buffers (num_buf > 1), and we make sure buffers
    683 * have enough headroom.
    684 */
    685static struct page *xdp_linearize_page(struct receive_queue *rq,
    686				       u16 *num_buf,
    687				       struct page *p,
    688				       int offset,
    689				       int page_off,
    690				       unsigned int *len)
    691{
    692	struct page *page = alloc_page(GFP_ATOMIC);
    693
    694	if (!page)
    695		return NULL;
    696
    697	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
    698	page_off += *len;
    699
    700	while (--*num_buf) {
    701		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    702		unsigned int buflen;
    703		void *buf;
    704		int off;
    705
    706		buf = virtqueue_get_buf(rq->vq, &buflen);
    707		if (unlikely(!buf))
    708			goto err_buf;
    709
    710		p = virt_to_head_page(buf);
    711		off = buf - page_address(p);
    712
    713		/* guard against a misconfigured or uncooperative backend that
     714		 * is sending packets larger than the MTU.
    715		 */
    716		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
    717			put_page(p);
    718			goto err_buf;
    719		}
    720
    721		memcpy(page_address(page) + page_off,
    722		       page_address(p) + off, buflen);
    723		page_off += buflen;
    724		put_page(p);
    725	}
    726
    727	/* Headroom does not contribute to packet length */
    728	*len = page_off - VIRTIO_XDP_HEADROOM;
    729	return page;
    730err_buf:
    731	__free_pages(page, 0);
    732	return NULL;
    733}
    734
    735static struct sk_buff *receive_small(struct net_device *dev,
    736				     struct virtnet_info *vi,
    737				     struct receive_queue *rq,
    738				     void *buf, void *ctx,
    739				     unsigned int len,
    740				     unsigned int *xdp_xmit,
    741				     struct virtnet_rq_stats *stats)
    742{
    743	struct sk_buff *skb;
    744	struct bpf_prog *xdp_prog;
    745	unsigned int xdp_headroom = (unsigned long)ctx;
    746	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
    747	unsigned int headroom = vi->hdr_len + header_offset;
    748	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
    749			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    750	struct page *page = virt_to_head_page(buf);
    751	unsigned int delta = 0;
    752	struct page *xdp_page;
    753	int err;
    754	unsigned int metasize = 0;
    755
    756	len -= vi->hdr_len;
    757	stats->bytes += len;
    758
    759	if (unlikely(len > GOOD_PACKET_LEN)) {
    760		pr_debug("%s: rx error: len %u exceeds max size %d\n",
    761			 dev->name, len, GOOD_PACKET_LEN);
    762		dev->stats.rx_length_errors++;
    763		goto err;
    764	}
    765
    766	if (likely(!vi->xdp_enabled)) {
    767		xdp_prog = NULL;
    768		goto skip_xdp;
    769	}
    770
    771	rcu_read_lock();
    772	xdp_prog = rcu_dereference(rq->xdp_prog);
    773	if (xdp_prog) {
    774		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
    775		struct xdp_frame *xdpf;
    776		struct xdp_buff xdp;
    777		void *orig_data;
    778		u32 act;
    779
    780		if (unlikely(hdr->hdr.gso_type))
    781			goto err_xdp;
    782
    783		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
    784			int offset = buf - page_address(page) + header_offset;
    785			unsigned int tlen = len + vi->hdr_len;
    786			u16 num_buf = 1;
    787
    788			xdp_headroom = virtnet_get_headroom(vi);
    789			header_offset = VIRTNET_RX_PAD + xdp_headroom;
    790			headroom = vi->hdr_len + header_offset;
    791			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
    792				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    793			xdp_page = xdp_linearize_page(rq, &num_buf, page,
    794						      offset, header_offset,
    795						      &tlen);
    796			if (!xdp_page)
    797				goto err_xdp;
    798
    799			buf = page_address(xdp_page);
    800			put_page(page);
    801			page = xdp_page;
    802		}
    803
    804		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
    805		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
    806				 xdp_headroom, len, true);
    807		orig_data = xdp.data;
    808		act = bpf_prog_run_xdp(xdp_prog, &xdp);
    809		stats->xdp_packets++;
    810
    811		switch (act) {
    812		case XDP_PASS:
    813			/* Recalculate length in case bpf program changed it */
    814			delta = orig_data - xdp.data;
    815			len = xdp.data_end - xdp.data;
    816			metasize = xdp.data - xdp.data_meta;
    817			break;
    818		case XDP_TX:
    819			stats->xdp_tx++;
    820			xdpf = xdp_convert_buff_to_frame(&xdp);
    821			if (unlikely(!xdpf))
    822				goto err_xdp;
    823			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
    824			if (unlikely(!err)) {
    825				xdp_return_frame_rx_napi(xdpf);
    826			} else if (unlikely(err < 0)) {
    827				trace_xdp_exception(vi->dev, xdp_prog, act);
    828				goto err_xdp;
    829			}
    830			*xdp_xmit |= VIRTIO_XDP_TX;
    831			rcu_read_unlock();
    832			goto xdp_xmit;
    833		case XDP_REDIRECT:
    834			stats->xdp_redirects++;
    835			err = xdp_do_redirect(dev, &xdp, xdp_prog);
    836			if (err)
    837				goto err_xdp;
    838			*xdp_xmit |= VIRTIO_XDP_REDIR;
    839			rcu_read_unlock();
    840			goto xdp_xmit;
    841		default:
    842			bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
    843			fallthrough;
    844		case XDP_ABORTED:
    845			trace_xdp_exception(vi->dev, xdp_prog, act);
    846			goto err_xdp;
    847		case XDP_DROP:
    848			goto err_xdp;
    849		}
    850	}
    851	rcu_read_unlock();
    852
    853skip_xdp:
    854	skb = build_skb(buf, buflen);
    855	if (!skb)
    856		goto err;
    857	skb_reserve(skb, headroom - delta);
    858	skb_put(skb, len);
    859	if (!xdp_prog) {
    860		buf += header_offset;
    861		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
    862	} /* keep zeroed vnet hdr since XDP is loaded */
    863
    864	if (metasize)
    865		skb_metadata_set(skb, metasize);
    866
    867	return skb;
    868
    869err_xdp:
    870	rcu_read_unlock();
    871	stats->xdp_drops++;
    872err:
    873	stats->drops++;
    874	put_page(page);
    875xdp_xmit:
    876	return NULL;
    877}
    878
    879static struct sk_buff *receive_big(struct net_device *dev,
    880				   struct virtnet_info *vi,
    881				   struct receive_queue *rq,
    882				   void *buf,
    883				   unsigned int len,
    884				   struct virtnet_rq_stats *stats)
    885{
    886	struct page *page = buf;
    887	struct sk_buff *skb =
    888		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
    889
    890	stats->bytes += len - vi->hdr_len;
    891	if (unlikely(!skb))
    892		goto err;
    893
    894	return skb;
    895
    896err:
    897	stats->drops++;
    898	give_pages(rq, page);
    899	return NULL;
    900}
    901
    902static struct sk_buff *receive_mergeable(struct net_device *dev,
    903					 struct virtnet_info *vi,
    904					 struct receive_queue *rq,
    905					 void *buf,
    906					 void *ctx,
    907					 unsigned int len,
    908					 unsigned int *xdp_xmit,
    909					 struct virtnet_rq_stats *stats)
    910{
    911	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
    912	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
    913	struct page *page = virt_to_head_page(buf);
    914	int offset = buf - page_address(page);
    915	struct sk_buff *head_skb, *curr_skb;
    916	struct bpf_prog *xdp_prog;
    917	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
    918	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
    919	unsigned int metasize = 0;
    920	unsigned int frame_sz;
    921	int err;
    922
    923	head_skb = NULL;
    924	stats->bytes += len - vi->hdr_len;
    925
    926	if (unlikely(len > truesize)) {
    927		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
    928			 dev->name, len, (unsigned long)ctx);
    929		dev->stats.rx_length_errors++;
    930		goto err_skb;
    931	}
    932
    933	if (likely(!vi->xdp_enabled)) {
    934		xdp_prog = NULL;
    935		goto skip_xdp;
    936	}
    937
    938	rcu_read_lock();
    939	xdp_prog = rcu_dereference(rq->xdp_prog);
    940	if (xdp_prog) {
    941		struct xdp_frame *xdpf;
    942		struct page *xdp_page;
    943		struct xdp_buff xdp;
    944		void *data;
    945		u32 act;
    946
    947		/* Transient failure which in theory could occur if
    948		 * in-flight packets from before XDP was enabled reach
    949		 * the receive path after XDP is loaded.
    950		 */
    951		if (unlikely(hdr->hdr.gso_type))
    952			goto err_xdp;
    953
    954		/* Buffers with headroom use PAGE_SIZE as alloc size,
    955		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
    956		 */
    957		frame_sz = headroom ? PAGE_SIZE : truesize;
    958
    959		/* This happens when rx buffer size is underestimated
     960		 * or headroom is not enough because the buffer
     961		 * was refilled before XDP was set. This should only
    962		 * happen for the first several packets, so we don't
    963		 * care much about its performance.
    964		 */
    965		if (unlikely(num_buf > 1 ||
    966			     headroom < virtnet_get_headroom(vi))) {
    967			/* linearize data for XDP */
    968			xdp_page = xdp_linearize_page(rq, &num_buf,
    969						      page, offset,
    970						      VIRTIO_XDP_HEADROOM,
    971						      &len);
    972			frame_sz = PAGE_SIZE;
    973
    974			if (!xdp_page)
    975				goto err_xdp;
    976			offset = VIRTIO_XDP_HEADROOM;
    977		} else {
    978			xdp_page = page;
    979		}
    980
    981		/* Allow consuming headroom but reserve enough space to push
    982		 * the descriptor on if we get an XDP_TX return code.
    983		 */
    984		data = page_address(xdp_page) + offset;
    985		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
    986		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
    987				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
    988
    989		act = bpf_prog_run_xdp(xdp_prog, &xdp);
    990		stats->xdp_packets++;
    991
    992		switch (act) {
    993		case XDP_PASS:
    994			metasize = xdp.data - xdp.data_meta;
    995
    996			/* recalculate offset to account for any header
     997			 * adjustments and subtract the metasize so the
     998			 * metadata is copied in page_to_skb(). Note other cases do not
    999			 * build an skb and avoid using offset
   1000			 */
   1001			offset = xdp.data - page_address(xdp_page) -
   1002				 vi->hdr_len - metasize;
   1003
   1004			/* recalculate len if xdp.data, xdp.data_end or
   1005			 * xdp.data_meta were adjusted
   1006			 */
   1007			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
   1008
    1009			/* Recalculate headroom if xdp.data or xdp.data_meta
    1010			 * were adjusted. Note that offset should always point
    1011			 * to the start of the reserved bytes for the virtio_net
    1012			 * header, which are followed by xdp.data. That means
    1013			 * offset is equal to the headroom (when buf starts at
    1014			 * the beginning of the page, otherwise there is a base
    1015			 * offset inside the page), but it is used with a
    1016			 * different starting point (buf start) than xdp.data
    1017			 * (buf start + vnet hdr size). If xdp.data or
    1018			 * data_meta were adjusted by the xdp prog, the
    1019			 * headroom size has changed and so has the offset; we
    1020			 * can use data_hard_start, which points at buf start +
    1021			 * vnet hdr size, to calculate the new headroom and use
    1022			 * it later to compute buf start in page_to_skb()
   1023			 */
   1024			headroom = xdp.data - xdp.data_hard_start - metasize;
   1025
   1026			/* We can only create skb based on xdp_page. */
   1027			if (unlikely(xdp_page != page)) {
   1028				rcu_read_unlock();
   1029				put_page(page);
   1030				head_skb = page_to_skb(vi, rq, xdp_page, offset,
   1031						       len, PAGE_SIZE, false,
   1032						       metasize,
   1033						       headroom);
   1034				return head_skb;
   1035			}
   1036			break;
   1037		case XDP_TX:
   1038			stats->xdp_tx++;
   1039			xdpf = xdp_convert_buff_to_frame(&xdp);
   1040			if (unlikely(!xdpf))
   1041				goto err_xdp;
   1042			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
   1043			if (unlikely(!err)) {
   1044				xdp_return_frame_rx_napi(xdpf);
   1045			} else if (unlikely(err < 0)) {
   1046				trace_xdp_exception(vi->dev, xdp_prog, act);
   1047				if (unlikely(xdp_page != page))
   1048					put_page(xdp_page);
   1049				goto err_xdp;
   1050			}
   1051			*xdp_xmit |= VIRTIO_XDP_TX;
   1052			if (unlikely(xdp_page != page))
   1053				put_page(page);
   1054			rcu_read_unlock();
   1055			goto xdp_xmit;
   1056		case XDP_REDIRECT:
   1057			stats->xdp_redirects++;
   1058			err = xdp_do_redirect(dev, &xdp, xdp_prog);
   1059			if (err) {
   1060				if (unlikely(xdp_page != page))
   1061					put_page(xdp_page);
   1062				goto err_xdp;
   1063			}
   1064			*xdp_xmit |= VIRTIO_XDP_REDIR;
   1065			if (unlikely(xdp_page != page))
   1066				put_page(page);
   1067			rcu_read_unlock();
   1068			goto xdp_xmit;
   1069		default:
   1070			bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
   1071			fallthrough;
   1072		case XDP_ABORTED:
   1073			trace_xdp_exception(vi->dev, xdp_prog, act);
   1074			fallthrough;
   1075		case XDP_DROP:
   1076			if (unlikely(xdp_page != page))
   1077				__free_pages(xdp_page, 0);
   1078			goto err_xdp;
   1079		}
   1080	}
   1081	rcu_read_unlock();
   1082
   1083skip_xdp:
   1084	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
   1085			       metasize, headroom);
   1086	curr_skb = head_skb;
   1087
   1088	if (unlikely(!curr_skb))
   1089		goto err_skb;
   1090	while (--num_buf) {
   1091		int num_skb_frags;
   1092
   1093		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
   1094		if (unlikely(!buf)) {
   1095			pr_debug("%s: rx error: %d buffers out of %d missing\n",
   1096				 dev->name, num_buf,
   1097				 virtio16_to_cpu(vi->vdev,
   1098						 hdr->num_buffers));
   1099			dev->stats.rx_length_errors++;
   1100			goto err_buf;
   1101		}
   1102
   1103		stats->bytes += len;
   1104		page = virt_to_head_page(buf);
   1105
   1106		truesize = mergeable_ctx_to_truesize(ctx);
   1107		if (unlikely(len > truesize)) {
   1108			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
   1109				 dev->name, len, (unsigned long)ctx);
   1110			dev->stats.rx_length_errors++;
   1111			goto err_skb;
   1112		}
   1113
   1114		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
   1115		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
   1116			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
   1117
   1118			if (unlikely(!nskb))
   1119				goto err_skb;
   1120			if (curr_skb == head_skb)
   1121				skb_shinfo(curr_skb)->frag_list = nskb;
   1122			else
   1123				curr_skb->next = nskb;
   1124			curr_skb = nskb;
   1125			head_skb->truesize += nskb->truesize;
   1126			num_skb_frags = 0;
   1127		}
   1128		if (curr_skb != head_skb) {
   1129			head_skb->data_len += len;
   1130			head_skb->len += len;
   1131			head_skb->truesize += truesize;
   1132		}
   1133		offset = buf - page_address(page);
   1134		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
   1135			put_page(page);
   1136			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
   1137					     len, truesize);
   1138		} else {
   1139			skb_add_rx_frag(curr_skb, num_skb_frags, page,
   1140					offset, len, truesize);
   1141		}
   1142	}
   1143
   1144	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
   1145	return head_skb;
   1146
   1147err_xdp:
   1148	rcu_read_unlock();
   1149	stats->xdp_drops++;
   1150err_skb:
   1151	put_page(page);
   1152	while (num_buf-- > 1) {
   1153		buf = virtqueue_get_buf(rq->vq, &len);
   1154		if (unlikely(!buf)) {
   1155			pr_debug("%s: rx error: %d buffers missing\n",
   1156				 dev->name, num_buf);
   1157			dev->stats.rx_length_errors++;
   1158			break;
   1159		}
   1160		stats->bytes += len;
   1161		page = virt_to_head_page(buf);
   1162		put_page(page);
   1163	}
   1164err_buf:
   1165	stats->drops++;
   1166	dev_kfree_skb(head_skb);
   1167xdp_xmit:
   1168	return NULL;
   1169}
   1170
   1171static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
   1172				struct sk_buff *skb)
   1173{
   1174	enum pkt_hash_types rss_hash_type;
   1175
   1176	if (!hdr_hash || !skb)
   1177		return;
   1178
   1179	switch ((int)hdr_hash->hash_report) {
   1180	case VIRTIO_NET_HASH_REPORT_TCPv4:
   1181	case VIRTIO_NET_HASH_REPORT_UDPv4:
   1182	case VIRTIO_NET_HASH_REPORT_TCPv6:
   1183	case VIRTIO_NET_HASH_REPORT_UDPv6:
   1184	case VIRTIO_NET_HASH_REPORT_TCPv6_EX:
   1185	case VIRTIO_NET_HASH_REPORT_UDPv6_EX:
   1186		rss_hash_type = PKT_HASH_TYPE_L4;
   1187		break;
   1188	case VIRTIO_NET_HASH_REPORT_IPv4:
   1189	case VIRTIO_NET_HASH_REPORT_IPv6:
   1190	case VIRTIO_NET_HASH_REPORT_IPv6_EX:
   1191		rss_hash_type = PKT_HASH_TYPE_L3;
   1192		break;
   1193	case VIRTIO_NET_HASH_REPORT_NONE:
   1194	default:
   1195		rss_hash_type = PKT_HASH_TYPE_NONE;
   1196	}
   1197	skb_set_hash(skb, (unsigned int)hdr_hash->hash_value, rss_hash_type);
   1198}
   1199
   1200static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
   1201			void *buf, unsigned int len, void **ctx,
   1202			unsigned int *xdp_xmit,
   1203			struct virtnet_rq_stats *stats)
   1204{
   1205	struct net_device *dev = vi->dev;
   1206	struct sk_buff *skb;
   1207	struct virtio_net_hdr_mrg_rxbuf *hdr;
   1208
   1209	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
   1210		pr_debug("%s: short packet %i\n", dev->name, len);
   1211		dev->stats.rx_length_errors++;
   1212		if (vi->mergeable_rx_bufs) {
   1213			put_page(virt_to_head_page(buf));
   1214		} else if (vi->big_packets) {
   1215			give_pages(rq, buf);
   1216		} else {
   1217			put_page(virt_to_head_page(buf));
   1218		}
   1219		return;
   1220	}
   1221
   1222	if (vi->mergeable_rx_bufs)
   1223		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
   1224					stats);
   1225	else if (vi->big_packets)
   1226		skb = receive_big(dev, vi, rq, buf, len, stats);
   1227	else
   1228		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
   1229
   1230	if (unlikely(!skb))
   1231		return;
   1232
   1233	hdr = skb_vnet_hdr(skb);
   1234	if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
   1235		virtio_skb_set_hash((const struct virtio_net_hdr_v1_hash *)hdr, skb);
   1236
   1237	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
   1238		skb->ip_summed = CHECKSUM_UNNECESSARY;
   1239
   1240	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
   1241				  virtio_is_little_endian(vi->vdev))) {
   1242		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
   1243				     dev->name, hdr->hdr.gso_type,
   1244				     hdr->hdr.gso_size);
   1245		goto frame_err;
   1246	}
   1247
   1248	skb_record_rx_queue(skb, vq2rxq(rq->vq));
   1249	skb->protocol = eth_type_trans(skb, dev);
   1250	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
   1251		 ntohs(skb->protocol), skb->len, skb->pkt_type);
   1252
   1253	napi_gro_receive(&rq->napi, skb);
   1254	return;
   1255
   1256frame_err:
   1257	dev->stats.rx_frame_errors++;
   1258	dev_kfree_skb(skb);
   1259}
   1260
    1261/* Unlike with mergeable buffers, all buffers are allocated with the
    1262 * same size, except for the headroom. For this reason we do
    1263 * not need to use mergeable_len_to_ctx here - it is enough
    1264 * to store the headroom as the context, ignoring the truesize.
   1265 */
   1266static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
   1267			     gfp_t gfp)
   1268{
   1269	struct page_frag *alloc_frag = &rq->alloc_frag;
   1270	char *buf;
   1271	unsigned int xdp_headroom = virtnet_get_headroom(vi);
   1272	void *ctx = (void *)(unsigned long)xdp_headroom;
   1273	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
   1274	int err;
   1275
   1276	len = SKB_DATA_ALIGN(len) +
   1277	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
   1278	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
   1279		return -ENOMEM;
   1280
   1281	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
   1282	get_page(alloc_frag->page);
   1283	alloc_frag->offset += len;
   1284	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
   1285		    vi->hdr_len + GOOD_PACKET_LEN);
   1286	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
   1287	if (err < 0)
   1288		put_page(virt_to_head_page(buf));
   1289	return err;
   1290}
   1291
   1292static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
   1293			   gfp_t gfp)
   1294{
   1295	struct page *first, *list = NULL;
   1296	char *p;
   1297	int i, err, offset;
   1298
   1299	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
   1300
   1301	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
   1302	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
   1303		first = get_a_page(rq, gfp);
   1304		if (!first) {
   1305			if (list)
   1306				give_pages(rq, list);
   1307			return -ENOMEM;
   1308		}
   1309		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
   1310
   1311		/* chain new page in list head to match sg */
   1312		first->private = (unsigned long)list;
   1313		list = first;
   1314	}
   1315
   1316	first = get_a_page(rq, gfp);
   1317	if (!first) {
   1318		give_pages(rq, list);
   1319		return -ENOMEM;
   1320	}
   1321	p = page_address(first);
   1322
   1323	/* rq->sg[0], rq->sg[1] share the same page */
    1324	/* a separate rq->sg[0] for the header - required in case !any_header_sg */
   1325	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
   1326
   1327	/* rq->sg[1] for data packet, from offset */
   1328	offset = sizeof(struct padded_vnet_hdr);
   1329	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
   1330
   1331	/* chain first in list head */
   1332	first->private = (unsigned long)list;
   1333	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
   1334				  first, gfp);
   1335	if (err < 0)
   1336		give_pages(rq, first);
   1337
   1338	return err;
   1339}
   1340
   1341static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
   1342					  struct ewma_pkt_len *avg_pkt_len,
   1343					  unsigned int room)
   1344{
   1345	struct virtnet_info *vi = rq->vq->vdev->priv;
   1346	const size_t hdr_len = vi->hdr_len;
   1347	unsigned int len;
   1348
   1349	if (room)
   1350		return PAGE_SIZE - room;
   1351
   1352	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
   1353				rq->min_buf_len, PAGE_SIZE - hdr_len);
   1354
   1355	return ALIGN(len, L1_CACHE_BYTES);
   1356}
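        /* Editor's note (worked example): with no XDP headroom (room == 0), the buffer
         * length is the EWMA packet length, clamped to [min_buf_len, PAGE_SIZE - hdr_len],
         * plus hdr_len, rounded up to the cacheline size: e.g. an EWMA of 1000 with
         * hdr_len 12 gives ALIGN(1012, 64) == 1024 on a 64-byte cacheline system.
         * With room != 0 (XDP headroom in use) the whole PAGE_SIZE - room is used instead.
         */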
   1357
   1358static int add_recvbuf_mergeable(struct virtnet_info *vi,
   1359				 struct receive_queue *rq, gfp_t gfp)
   1360{
   1361	struct page_frag *alloc_frag = &rq->alloc_frag;
   1362	unsigned int headroom = virtnet_get_headroom(vi);
   1363	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
   1364	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
   1365	char *buf;
   1366	void *ctx;
   1367	int err;
   1368	unsigned int len, hole;
   1369
   1370	/* Extra tailroom is needed to satisfy XDP's assumption. This
    1371	 * means rx frag coalescing won't work, but since we've
    1372	 * disabled GSO for XDP, it won't be a big issue.
   1373	 */
   1374	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
   1375	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
   1376		return -ENOMEM;
   1377
   1378	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
   1379	buf += headroom; /* advance address leaving hole at front of pkt */
   1380	get_page(alloc_frag->page);
   1381	alloc_frag->offset += len + room;
   1382	hole = alloc_frag->size - alloc_frag->offset;
   1383	if (hole < len + room) {
   1384		/* To avoid internal fragmentation, if there is very likely not
   1385		 * enough space for another buffer, add the remaining space to
   1386		 * the current buffer.
   1387		 */
   1388		len += hole;
   1389		alloc_frag->offset += hole;
   1390	}
   1391
   1392	sg_init_one(rq->sg, buf, len);
   1393	ctx = mergeable_len_to_ctx(len, headroom);
   1394	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
   1395	if (err < 0)
   1396		put_page(virt_to_head_page(buf));
   1397
   1398	return err;
   1399}
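        /* Editor's note: the "hole" handling above folds any leftover space at the end
         * of the page frag into the current buffer when that space is too small to host
         * another buffer of len + room bytes, trading a slightly larger truesize for
         * not leaving an unusable sliver of the page behind.
         */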
   1400
   1401/*
   1402 * Returns false if we couldn't fill entirely (OOM).
   1403 *
   1404 * Normally run in the receive path, but can also be run from ndo_open
   1405 * before we're receiving packets, or from refill_work which is
   1406 * careful to disable receiving (using napi_disable).
   1407 */
   1408static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
   1409			  gfp_t gfp)
   1410{
   1411	int err;
   1412	bool oom;
   1413
   1414	do {
   1415		if (vi->mergeable_rx_bufs)
   1416			err = add_recvbuf_mergeable(vi, rq, gfp);
   1417		else if (vi->big_packets)
   1418			err = add_recvbuf_big(vi, rq, gfp);
   1419		else
   1420			err = add_recvbuf_small(vi, rq, gfp);
   1421
   1422		oom = err == -ENOMEM;
   1423		if (err)
   1424			break;
   1425	} while (rq->vq->num_free);
   1426	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
   1427		unsigned long flags;
   1428
   1429		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
   1430		rq->stats.kicks++;
   1431		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
   1432	}
   1433
   1434	return !oom;
   1435}
   1436
   1437static void skb_recv_done(struct virtqueue *rvq)
   1438{
   1439	struct virtnet_info *vi = rvq->vdev->priv;
   1440	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
   1441
   1442	virtqueue_napi_schedule(&rq->napi, rvq);
   1443}
   1444
   1445static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
   1446{
   1447	napi_enable(napi);
   1448
    1449	/* If all buffers were filled by the other side before we enabled napi, we
    1450	 * won't get another interrupt, so process any outstanding packets now.
    1451	 * Call local_bh_enable after this to trigger softIRQ processing.
   1452	 */
   1453	local_bh_disable();
   1454	virtqueue_napi_schedule(napi, vq);
   1455	local_bh_enable();
   1456}
   1457
   1458static void virtnet_napi_tx_enable(struct virtnet_info *vi,
   1459				   struct virtqueue *vq,
   1460				   struct napi_struct *napi)
   1461{
   1462	if (!napi->weight)
   1463		return;
   1464
   1465	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
   1466	 * enable the feature if this is likely affine with the transmit path.
   1467	 */
   1468	if (!vi->affinity_hint_set) {
   1469		napi->weight = 0;
   1470		return;
   1471	}
   1472
   1473	return virtnet_napi_enable(vq, napi);
   1474}
   1475
   1476static void virtnet_napi_tx_disable(struct napi_struct *napi)
   1477{
   1478	if (napi->weight)
   1479		napi_disable(napi);
   1480}
   1481
   1482static void refill_work(struct work_struct *work)
   1483{
   1484	struct virtnet_info *vi =
   1485		container_of(work, struct virtnet_info, refill.work);
   1486	bool still_empty;
   1487	int i;
   1488
   1489	for (i = 0; i < vi->curr_queue_pairs; i++) {
   1490		struct receive_queue *rq = &vi->rq[i];
   1491
   1492		napi_disable(&rq->napi);
   1493		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
   1494		virtnet_napi_enable(rq->vq, &rq->napi);
   1495
    1496		/* In theory, this can happen: if we don't get any buffers in,
   1497		 * we will *never* try to fill again.
   1498		 */
   1499		if (still_empty)
   1500			schedule_delayed_work(&vi->refill, HZ/2);
   1501	}
   1502}
   1503
   1504static int virtnet_receive(struct receive_queue *rq, int budget,
   1505			   unsigned int *xdp_xmit)
   1506{
   1507	struct virtnet_info *vi = rq->vq->vdev->priv;
   1508	struct virtnet_rq_stats stats = {};
   1509	unsigned int len;
   1510	void *buf;
   1511	int i;
   1512
   1513	if (!vi->big_packets || vi->mergeable_rx_bufs) {
   1514		void *ctx;
   1515
   1516		while (stats.packets < budget &&
   1517		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
   1518			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
   1519			stats.packets++;
   1520		}
   1521	} else {
   1522		while (stats.packets < budget &&
   1523		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
   1524			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
   1525			stats.packets++;
   1526		}
   1527	}
   1528
   1529	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
   1530		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
   1531			schedule_delayed_work(&vi->refill, 0);
   1532	}
   1533
   1534	u64_stats_update_begin(&rq->stats.syncp);
   1535	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
   1536		size_t offset = virtnet_rq_stats_desc[i].offset;
   1537		u64 *item;
   1538
   1539		item = (u64 *)((u8 *)&rq->stats + offset);
   1540		*item += *(u64 *)((u8 *)&stats + offset);
   1541	}
   1542	u64_stats_update_end(&rq->stats.syncp);
   1543
   1544	return stats.packets;
   1545}
   1546
   1547static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
   1548{
   1549	unsigned int len;
   1550	unsigned int packets = 0;
   1551	unsigned int bytes = 0;
   1552	void *ptr;
   1553
   1554	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
   1555		if (likely(!is_xdp_frame(ptr))) {
   1556			struct sk_buff *skb = ptr;
   1557
   1558			pr_debug("Sent skb %p\n", skb);
   1559
   1560			bytes += skb->len;
   1561			napi_consume_skb(skb, in_napi);
   1562		} else {
   1563			struct xdp_frame *frame = ptr_to_xdp(ptr);
   1564
   1565			bytes += frame->len;
   1566			xdp_return_frame(frame);
   1567		}
   1568		packets++;
   1569	}
   1570
    1571	/* Avoid overhead when no packets have been processed; this
    1572	 * happens when called speculatively from start_xmit.
   1573	 */
   1574	if (!packets)
   1575		return;
   1576
   1577	u64_stats_update_begin(&sq->stats.syncp);
   1578	sq->stats.bytes += bytes;
   1579	sq->stats.packets += packets;
   1580	u64_stats_update_end(&sq->stats.syncp);
   1581}
   1582
   1583static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
   1584{
   1585	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
   1586		return false;
   1587	else if (q < vi->curr_queue_pairs)
   1588		return true;
   1589	else
   1590		return false;
   1591}
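        /* Editor's note: queues with index in [curr_queue_pairs - xdp_queue_pairs,
         * curr_queue_pairs) are reserved for XDP transmission; their completed buffers
         * are raw xdp_frames rather than sk_buffs, which is why the skb-oriented tx
         * cleanup in virtnet_poll_cleantx() and virtnet_poll_tx() skips them.
         */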
   1592
   1593static void virtnet_poll_cleantx(struct receive_queue *rq)
   1594{
   1595	struct virtnet_info *vi = rq->vq->vdev->priv;
   1596	unsigned int index = vq2rxq(rq->vq);
   1597	struct send_queue *sq = &vi->sq[index];
   1598	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
   1599
   1600	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
   1601		return;
   1602
   1603	if (__netif_tx_trylock(txq)) {
   1604		do {
   1605			virtqueue_disable_cb(sq->vq);
   1606			free_old_xmit_skbs(sq, true);
   1607		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
   1608
   1609		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
   1610			netif_tx_wake_queue(txq);
   1611
   1612		__netif_tx_unlock(txq);
   1613	}
   1614}
   1615
   1616static int virtnet_poll(struct napi_struct *napi, int budget)
   1617{
   1618	struct receive_queue *rq =
   1619		container_of(napi, struct receive_queue, napi);
   1620	struct virtnet_info *vi = rq->vq->vdev->priv;
   1621	struct send_queue *sq;
   1622	unsigned int received;
   1623	unsigned int xdp_xmit = 0;
   1624
   1625	virtnet_poll_cleantx(rq);
   1626
   1627	received = virtnet_receive(rq, budget, &xdp_xmit);
   1628
   1629	/* Out of packets? */
   1630	if (received < budget)
   1631		virtqueue_napi_complete(napi, rq->vq, received);
   1632
   1633	if (xdp_xmit & VIRTIO_XDP_REDIR)
   1634		xdp_do_flush();
   1635
   1636	if (xdp_xmit & VIRTIO_XDP_TX) {
   1637		sq = virtnet_xdp_get_sq(vi);
   1638		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
   1639			u64_stats_update_begin(&sq->stats.syncp);
   1640			sq->stats.kicks++;
   1641			u64_stats_update_end(&sq->stats.syncp);
   1642		}
   1643		virtnet_xdp_put_sq(vi, sq);
   1644	}
   1645
   1646	return received;
   1647}
   1648
   1649static int virtnet_open(struct net_device *dev)
   1650{
   1651	struct virtnet_info *vi = netdev_priv(dev);
   1652	int i, err;
   1653
   1654	for (i = 0; i < vi->max_queue_pairs; i++) {
   1655		if (i < vi->curr_queue_pairs)
   1656			/* Make sure we have some buffers: if oom use wq. */
   1657			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
   1658				schedule_delayed_work(&vi->refill, 0);
   1659
   1660		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
   1661		if (err < 0)
   1662			return err;
   1663
   1664		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
   1665						 MEM_TYPE_PAGE_SHARED, NULL);
   1666		if (err < 0) {
   1667			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
   1668			return err;
   1669		}
   1670
   1671		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
   1672		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
   1673	}
   1674
   1675	return 0;
   1676}
   1677
   1678static int virtnet_poll_tx(struct napi_struct *napi, int budget)
   1679{
   1680	struct send_queue *sq = container_of(napi, struct send_queue, napi);
   1681	struct virtnet_info *vi = sq->vq->vdev->priv;
   1682	unsigned int index = vq2txq(sq->vq);
   1683	struct netdev_queue *txq;
   1684	int opaque;
   1685	bool done;
   1686
   1687	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
   1688		/* We don't need to enable cb for XDP */
   1689		napi_complete_done(napi, 0);
   1690		return 0;
   1691	}
   1692
   1693	txq = netdev_get_tx_queue(vi->dev, index);
   1694	__netif_tx_lock(txq, raw_smp_processor_id());
   1695	virtqueue_disable_cb(sq->vq);
   1696	free_old_xmit_skbs(sq, true);
   1697
   1698	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
   1699		netif_tx_wake_queue(txq);
   1700
   1701	opaque = virtqueue_enable_cb_prepare(sq->vq);
   1702
   1703	done = napi_complete_done(napi, 0);
   1704
   1705	if (!done)
   1706		virtqueue_disable_cb(sq->vq);
   1707
   1708	__netif_tx_unlock(txq);
   1709
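        	/* If NAPI was completed above, callbacks were re-armed by
        	 * virtqueue_enable_cb_prepare().  Check for completions that
        	 * arrived in the meantime and, if any, re-schedule NAPI so they
        	 * are not left pending until the next interrupt.
        	 */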
   1710	if (done) {
   1711		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
   1712			if (napi_schedule_prep(napi)) {
   1713				__netif_tx_lock(txq, raw_smp_processor_id());
   1714				virtqueue_disable_cb(sq->vq);
   1715				__netif_tx_unlock(txq);
   1716				__napi_schedule(napi);
   1717			}
   1718		}
   1719	}
   1720
   1721	return 0;
   1722}
   1723
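        /* Build the virtio-net header for @skb and add the packet to the send
         * virtqueue.  If the device accepts an arbitrary header layout
         * (any_header_sg) and the skb has enough headroom, the header is
         * pushed into the skb data so header and packet share scatterlist
         * entries; otherwise the header is passed as a separate sg entry.
         */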
   1724static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
   1725{
   1726	struct virtio_net_hdr_mrg_rxbuf *hdr;
   1727	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
   1728	struct virtnet_info *vi = sq->vq->vdev->priv;
   1729	int num_sg;
   1730	unsigned hdr_len = vi->hdr_len;
   1731	bool can_push;
   1732
   1733	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
   1734
   1735	can_push = vi->any_header_sg &&
   1736		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
   1737		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
   1738	/* Even if we can, don't push here yet as this would skew
   1739	 * csum_start offset below. */
   1740	if (can_push)
   1741		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
   1742	else
   1743		hdr = skb_vnet_hdr(skb);
   1744
   1745	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
   1746				    virtio_is_little_endian(vi->vdev), false,
   1747				    0))
   1748		return -EPROTO;
   1749
   1750	if (vi->mergeable_rx_bufs)
   1751		hdr->num_buffers = 0;
   1752
   1753	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
   1754	if (can_push) {
   1755		__skb_push(skb, hdr_len);
   1756		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
   1757		if (unlikely(num_sg < 0))
   1758			return num_sg;
   1759		/* Pull header back to avoid skew in tx bytes calculations. */
   1760		__skb_pull(skb, hdr_len);
   1761	} else {
   1762		sg_set_buf(sq->sg, hdr, hdr_len);
   1763		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
   1764		if (unlikely(num_sg < 0))
   1765			return num_sg;
   1766		num_sg++;
   1767	}
   1768	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
   1769}
   1770
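        /* .ndo_start_xmit handler: reclaim completed buffers, queue the skb on
         * the per-queue virtqueue, stop the subqueue when fewer than
         * 2 + MAX_SKB_FRAGS descriptors remain, and kick the device unless the
         * stack has indicated that more packets are coming (xmit_more).
         */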
   1771static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
   1772{
   1773	struct virtnet_info *vi = netdev_priv(dev);
   1774	int qnum = skb_get_queue_mapping(skb);
   1775	struct send_queue *sq = &vi->sq[qnum];
   1776	int err;
   1777	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
   1778	bool kick = !netdev_xmit_more();
   1779	bool use_napi = sq->napi.weight;
   1780
   1781	/* Free up any pending old buffers before queueing new ones. */
   1782	do {
   1783		if (use_napi)
   1784			virtqueue_disable_cb(sq->vq);
   1785
   1786		free_old_xmit_skbs(sq, false);
   1787
   1788	} while (use_napi && kick &&
   1789	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
   1790
   1791	/* timestamp packet in software */
   1792	skb_tx_timestamp(skb);
   1793
   1794	/* Try to transmit */
   1795	err = xmit_skb(sq, skb);
   1796
   1797	/* This should not happen! */
   1798	if (unlikely(err)) {
   1799		dev->stats.tx_fifo_errors++;
   1800		if (net_ratelimit())
   1801			dev_warn(&dev->dev,
   1802				 "Unexpected TXQ (%d) queue failure: %d\n",
   1803				 qnum, err);
   1804		dev->stats.tx_dropped++;
   1805		dev_kfree_skb_any(skb);
   1806		return NETDEV_TX_OK;
   1807	}
   1808
   1809	/* Don't wait up for transmitted skbs to be freed. */
   1810	if (!use_napi) {
   1811		skb_orphan(skb);
   1812		nf_reset_ct(skb);
   1813	}
   1814
   1815	/* If running out of space, stop queue to avoid getting packets that we
   1816	 * are then unable to transmit.
   1817	 * An alternative would be to force queuing layer to requeue the skb by
   1818	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
   1819	 * returned in a normal path of operation: it means that driver is not
   1820	 * maintaining the TX queue stop/start state properly, and causes
   1821	 * the stack to do a non-trivial amount of useless work.
   1822	 * Since most packets only take 1 or 2 ring slots, stopping the queue
   1823	 * early means 16 slots are typically wasted.
   1824	 */
   1825	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
   1826		netif_stop_subqueue(dev, qnum);
   1827		if (!use_napi &&
   1828		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
   1829			/* More just got used, free them then recheck. */
   1830			free_old_xmit_skbs(sq, false);
   1831			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
   1832				netif_start_subqueue(dev, qnum);
   1833				virtqueue_disable_cb(sq->vq);
   1834			}
   1835		}
   1836	}
   1837
   1838	if (kick || netif_xmit_stopped(txq)) {
   1839		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
   1840			u64_stats_update_begin(&sq->stats.syncp);
   1841			sq->stats.kicks++;
   1842			u64_stats_update_end(&sq->stats.syncp);
   1843		}
   1844	}
   1845
   1846	return NETDEV_TX_OK;
   1847}
   1848
   1849/*
   1850 * Send command via the control virtqueue and check status.  Commands
   1851 * supported by the hypervisor, as indicated by feature bits, should
   1852 * never fail unless improperly formatted.
   1853 */
   1854static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
   1855				 struct scatterlist *out)
   1856{
   1857	struct scatterlist *sgs[4], hdr, stat;
   1858	unsigned out_num = 0, tmp;
   1859	int ret;
   1860
   1861	/* Caller should know better */
   1862	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
   1863
   1864	vi->ctrl->status = ~0;
   1865	vi->ctrl->hdr.class = class;
   1866	vi->ctrl->hdr.cmd = cmd;
   1867	/* Add header */
   1868	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
   1869	sgs[out_num++] = &hdr;
   1870
   1871	if (out)
   1872		sgs[out_num++] = out;
   1873
   1874	/* Add return status. */
   1875	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
   1876	sgs[out_num] = &stat;
   1877
   1878	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
   1879	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
   1880	if (ret < 0) {
    1881		dev_warn(&vi->vdev->dev,
    1882			 "Failed to add sgs for command vq: %d\n", ret);
   1883		return false;
   1884	}
   1885
   1886	if (unlikely(!virtqueue_kick(vi->cvq)))
   1887		return vi->ctrl->status == VIRTIO_NET_OK;
   1888
    1889	/* Spin for a response; the kick causes an ioport write, trapping
    1890	 * into the hypervisor, so the request should be handled immediately.
    1891	 */
   1892	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
   1893	       !virtqueue_is_broken(vi->cvq))
   1894		cpu_relax();
   1895
   1896	return vi->ctrl->status == VIRTIO_NET_OK;
   1897}
   1898
   1899static int virtnet_set_mac_address(struct net_device *dev, void *p)
   1900{
   1901	struct virtnet_info *vi = netdev_priv(dev);
   1902	struct virtio_device *vdev = vi->vdev;
   1903	int ret;
   1904	struct sockaddr *addr;
   1905	struct scatterlist sg;
   1906
   1907	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
   1908		return -EOPNOTSUPP;
   1909
   1910	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
   1911	if (!addr)
   1912		return -ENOMEM;
   1913
   1914	ret = eth_prepare_mac_addr_change(dev, addr);
   1915	if (ret)
   1916		goto out;
   1917
   1918	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
   1919		sg_init_one(&sg, addr->sa_data, dev->addr_len);
   1920		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
   1921					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
   1922			dev_warn(&vdev->dev,
   1923				 "Failed to set mac address by vq command.\n");
   1924			ret = -EINVAL;
   1925			goto out;
   1926		}
   1927	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
   1928		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
   1929		unsigned int i;
   1930
   1931		/* Naturally, this has an atomicity problem. */
   1932		for (i = 0; i < dev->addr_len; i++)
   1933			virtio_cwrite8(vdev,
   1934				       offsetof(struct virtio_net_config, mac) +
   1935				       i, addr->sa_data[i]);
   1936	}
   1937
   1938	eth_commit_mac_addr_change(dev, p);
   1939	ret = 0;
   1940
   1941out:
   1942	kfree(addr);
   1943	return ret;
   1944}
   1945
   1946static void virtnet_stats(struct net_device *dev,
   1947			  struct rtnl_link_stats64 *tot)
   1948{
   1949	struct virtnet_info *vi = netdev_priv(dev);
   1950	unsigned int start;
   1951	int i;
   1952
   1953	for (i = 0; i < vi->max_queue_pairs; i++) {
   1954		u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
   1955		struct receive_queue *rq = &vi->rq[i];
   1956		struct send_queue *sq = &vi->sq[i];
   1957
   1958		do {
   1959			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
   1960			tpackets = sq->stats.packets;
   1961			tbytes   = sq->stats.bytes;
   1962			terrors  = sq->stats.tx_timeouts;
   1963		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
   1964
   1965		do {
   1966			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
   1967			rpackets = rq->stats.packets;
   1968			rbytes   = rq->stats.bytes;
   1969			rdrops   = rq->stats.drops;
   1970		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
   1971
   1972		tot->rx_packets += rpackets;
   1973		tot->tx_packets += tpackets;
   1974		tot->rx_bytes   += rbytes;
   1975		tot->tx_bytes   += tbytes;
   1976		tot->rx_dropped += rdrops;
   1977		tot->tx_errors  += terrors;
   1978	}
   1979
   1980	tot->tx_dropped = dev->stats.tx_dropped;
   1981	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
   1982	tot->rx_length_errors = dev->stats.rx_length_errors;
   1983	tot->rx_frame_errors = dev->stats.rx_frame_errors;
   1984}
   1985
   1986static void virtnet_ack_link_announce(struct virtnet_info *vi)
   1987{
   1988	rtnl_lock();
   1989	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
   1990				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
   1991		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
   1992	rtnl_unlock();
   1993}
   1994
   1995static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
   1996{
   1997	struct scatterlist sg;
   1998	struct net_device *dev = vi->dev;
   1999
   2000	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
   2001		return 0;
   2002
   2003	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
   2004	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
   2005
   2006	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
   2007				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
    2008		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
   2009			 queue_pairs);
   2010		return -EINVAL;
   2011	} else {
   2012		vi->curr_queue_pairs = queue_pairs;
    2013		/* virtnet_open() will refill when the device is brought up. */
   2014		if (dev->flags & IFF_UP)
   2015			schedule_delayed_work(&vi->refill, 0);
   2016	}
   2017
   2018	return 0;
   2019}
   2020
   2021static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
   2022{
   2023	int err;
   2024
   2025	rtnl_lock();
   2026	err = _virtnet_set_queues(vi, queue_pairs);
   2027	rtnl_unlock();
   2028	return err;
   2029}
   2030
   2031static int virtnet_close(struct net_device *dev)
   2032{
   2033	struct virtnet_info *vi = netdev_priv(dev);
   2034	int i;
   2035
   2036	/* Make sure refill_work doesn't re-enable napi! */
   2037	cancel_delayed_work_sync(&vi->refill);
   2038
   2039	for (i = 0; i < vi->max_queue_pairs; i++) {
   2040		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
   2041		napi_disable(&vi->rq[i].napi);
   2042		virtnet_napi_tx_disable(&vi->sq[i].napi);
   2043	}
   2044
   2045	return 0;
   2046}
   2047
   2048static void virtnet_set_rx_mode(struct net_device *dev)
   2049{
   2050	struct virtnet_info *vi = netdev_priv(dev);
   2051	struct scatterlist sg[2];
   2052	struct virtio_net_ctrl_mac *mac_data;
   2053	struct netdev_hw_addr *ha;
   2054	int uc_count;
   2055	int mc_count;
   2056	void *buf;
   2057	int i;
   2058
   2059	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
   2060	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
   2061		return;
   2062
   2063	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
   2064	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
   2065
   2066	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
   2067
   2068	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
   2069				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
   2070		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
   2071			 vi->ctrl->promisc ? "en" : "dis");
   2072
   2073	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
   2074
   2075	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
   2076				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
   2077		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
   2078			 vi->ctrl->allmulti ? "en" : "dis");
   2079
   2080	uc_count = netdev_uc_count(dev);
   2081	mc_count = netdev_mc_count(dev);
   2082	/* MAC filter - use one buffer for both lists */
   2083	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
   2084		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
   2085	mac_data = buf;
   2086	if (!buf)
   2087		return;
   2088
   2089	sg_init_table(sg, 2);
   2090
   2091	/* Store the unicast list and count in the front of the buffer */
   2092	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
   2093	i = 0;
   2094	netdev_for_each_uc_addr(ha, dev)
   2095		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
   2096
   2097	sg_set_buf(&sg[0], mac_data,
   2098		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
   2099
   2100	/* multicast list and count fill the end */
   2101	mac_data = (void *)&mac_data->macs[uc_count][0];
   2102
   2103	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
   2104	i = 0;
   2105	netdev_for_each_mc_addr(ha, dev)
   2106		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
   2107
   2108	sg_set_buf(&sg[1], mac_data,
   2109		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
   2110
   2111	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
   2112				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
   2113		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
   2114
   2115	kfree(buf);
   2116}
   2117
   2118static int virtnet_vlan_rx_add_vid(struct net_device *dev,
   2119				   __be16 proto, u16 vid)
   2120{
   2121	struct virtnet_info *vi = netdev_priv(dev);
   2122	struct scatterlist sg;
   2123
   2124	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
   2125	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
   2126
   2127	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
   2128				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
   2129		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
   2130	return 0;
   2131}
   2132
   2133static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
   2134				    __be16 proto, u16 vid)
   2135{
   2136	struct virtnet_info *vi = netdev_priv(dev);
   2137	struct scatterlist sg;
   2138
   2139	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
   2140	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
   2141
   2142	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
   2143				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
   2144		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
   2145	return 0;
   2146}
   2147
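        /* Drop any per-virtqueue CPU affinity hints previously installed by
         * virtnet_set_affinity().
         */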
   2148static void virtnet_clean_affinity(struct virtnet_info *vi)
   2149{
   2150	int i;
   2151
   2152	if (vi->affinity_hint_set) {
   2153		for (i = 0; i < vi->max_queue_pairs; i++) {
   2154			virtqueue_set_affinity(vi->rq[i].vq, NULL);
   2155			virtqueue_set_affinity(vi->sq[i].vq, NULL);
   2156		}
   2157
   2158		vi->affinity_hint_set = false;
   2159	}
   2160}
   2161
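        /* Spread the queue pairs over the online CPUs: each pair gets a
         * contiguous group of roughly num_online_cpus() / curr_queue_pairs
         * CPUs (the first "stragglers" pairs get one extra), and the same mask
         * is installed as the XPS map for the corresponding TX queue.
         */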
   2162static void virtnet_set_affinity(struct virtnet_info *vi)
   2163{
   2164	cpumask_var_t mask;
   2165	int stragglers;
   2166	int group_size;
   2167	int i, j, cpu;
   2168	int num_cpu;
   2169	int stride;
   2170
   2171	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
   2172		virtnet_clean_affinity(vi);
   2173		return;
   2174	}
   2175
   2176	num_cpu = num_online_cpus();
   2177	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
   2178	stragglers = num_cpu >= vi->curr_queue_pairs ?
   2179			num_cpu % vi->curr_queue_pairs :
   2180			0;
   2181	cpu = cpumask_first(cpu_online_mask);
   2182
   2183	for (i = 0; i < vi->curr_queue_pairs; i++) {
   2184		group_size = stride + (i < stragglers ? 1 : 0);
   2185
   2186		for (j = 0; j < group_size; j++) {
   2187			cpumask_set_cpu(cpu, mask);
   2188			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
   2189						nr_cpu_ids, false);
   2190		}
   2191		virtqueue_set_affinity(vi->rq[i].vq, mask);
   2192		virtqueue_set_affinity(vi->sq[i].vq, mask);
   2193		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
   2194		cpumask_clear(mask);
   2195	}
   2196
   2197	vi->affinity_hint_set = true;
   2198	free_cpumask_var(mask);
   2199}
   2200
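        /* CPU hotplug callbacks: re-distribute the virtqueue affinity when a
         * CPU comes online or has gone down, and drop the hints while a CPU is
         * being taken offline.
         */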
   2201static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
   2202{
   2203	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
   2204						   node);
   2205	virtnet_set_affinity(vi);
   2206	return 0;
   2207}
   2208
   2209static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
   2210{
   2211	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
   2212						   node_dead);
   2213	virtnet_set_affinity(vi);
   2214	return 0;
   2215}
   2216
   2217static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
   2218{
   2219	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
   2220						   node);
   2221
   2222	virtnet_clean_affinity(vi);
   2223	return 0;
   2224}
   2225
   2226static enum cpuhp_state virtionet_online;
   2227
   2228static int virtnet_cpu_notif_add(struct virtnet_info *vi)
   2229{
   2230	int ret;
   2231
   2232	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
   2233	if (ret)
   2234		return ret;
   2235	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
   2236					       &vi->node_dead);
   2237	if (!ret)
   2238		return ret;
   2239	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
   2240	return ret;
   2241}
   2242
   2243static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
   2244{
   2245	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
   2246	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
   2247					    &vi->node_dead);
   2248}
   2249
   2250static void virtnet_get_ringparam(struct net_device *dev,
   2251				  struct ethtool_ringparam *ring,
   2252				  struct kernel_ethtool_ringparam *kernel_ring,
   2253				  struct netlink_ext_ack *extack)
   2254{
   2255	struct virtnet_info *vi = netdev_priv(dev);
   2256
   2257	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
   2258	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
   2259	ring->rx_pending = ring->rx_max_pending;
   2260	ring->tx_pending = ring->tx_max_pending;
   2261}
   2262
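        /* Push the current RSS/hash configuration to the device.  The
         * virtio_net_ctrl_rss layout is sent as four scatterlist entries: the
         * fields up to the indirection table, the indirection table itself,
         * the fields from max_tx_vq up to the key, and finally the hash key.
         */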
   2263static bool virtnet_commit_rss_command(struct virtnet_info *vi)
   2264{
   2265	struct net_device *dev = vi->dev;
   2266	struct scatterlist sgs[4];
   2267	unsigned int sg_buf_size;
   2268
   2269	/* prepare sgs */
   2270	sg_init_table(sgs, 4);
   2271
   2272	sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table);
   2273	sg_set_buf(&sgs[0], &vi->ctrl->rss, sg_buf_size);
   2274
   2275	sg_buf_size = sizeof(uint16_t) * (vi->ctrl->rss.indirection_table_mask + 1);
   2276	sg_set_buf(&sgs[1], vi->ctrl->rss.indirection_table, sg_buf_size);
   2277
   2278	sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key)
   2279			- offsetof(struct virtio_net_ctrl_rss, max_tx_vq);
   2280	sg_set_buf(&sgs[2], &vi->ctrl->rss.max_tx_vq, sg_buf_size);
   2281
   2282	sg_buf_size = vi->rss_key_size;
   2283	sg_set_buf(&sgs[3], vi->ctrl->rss.key, sg_buf_size);
   2284
   2285	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
   2286				  vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG
   2287				  : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) {
    2288		dev_warn(&dev->dev, "Failed to commit RSS configuration\n");
   2289		return false;
   2290	}
   2291	return true;
   2292}
   2293
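        /* Set up a default RSS configuration: enable every supported hash
         * type, fill the indirection table with the standard ethtool default
         * spread across the current queue pairs, and fill the hash key from
         * the kernel's random RSS key.
         */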
   2294static void virtnet_init_default_rss(struct virtnet_info *vi)
   2295{
   2296	u32 indir_val = 0;
   2297	int i = 0;
   2298
   2299	vi->ctrl->rss.hash_types = vi->rss_hash_types_supported;
   2300	vi->rss_hash_types_saved = vi->rss_hash_types_supported;
   2301	vi->ctrl->rss.indirection_table_mask = vi->rss_indir_table_size
   2302						? vi->rss_indir_table_size - 1 : 0;
   2303	vi->ctrl->rss.unclassified_queue = 0;
   2304
   2305	for (; i < vi->rss_indir_table_size; ++i) {
   2306		indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs);
   2307		vi->ctrl->rss.indirection_table[i] = indir_val;
   2308	}
   2309
   2310	vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
   2311	vi->ctrl->rss.hash_key_length = vi->rss_key_size;
   2312
   2313	netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
   2314}
   2315
   2316static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info)
   2317{
   2318	info->data = 0;
   2319	switch (info->flow_type) {
   2320	case TCP_V4_FLOW:
   2321		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
   2322			info->data = RXH_IP_SRC | RXH_IP_DST |
   2323						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
   2324		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
   2325			info->data = RXH_IP_SRC | RXH_IP_DST;
   2326		}
   2327		break;
   2328	case TCP_V6_FLOW:
   2329		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
   2330			info->data = RXH_IP_SRC | RXH_IP_DST |
   2331						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
   2332		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
   2333			info->data = RXH_IP_SRC | RXH_IP_DST;
   2334		}
   2335		break;
   2336	case UDP_V4_FLOW:
   2337		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
   2338			info->data = RXH_IP_SRC | RXH_IP_DST |
   2339						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
   2340		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
   2341			info->data = RXH_IP_SRC | RXH_IP_DST;
   2342		}
   2343		break;
   2344	case UDP_V6_FLOW:
   2345		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
   2346			info->data = RXH_IP_SRC | RXH_IP_DST |
   2347						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
   2348		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
   2349			info->data = RXH_IP_SRC | RXH_IP_DST;
   2350		}
   2351		break;
   2352	case IPV4_FLOW:
   2353		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
   2354			info->data = RXH_IP_SRC | RXH_IP_DST;
   2355
   2356		break;
   2357	case IPV6_FLOW:
   2358		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
   2359			info->data = RXH_IP_SRC | RXH_IP_DST;
   2360
   2361		break;
   2362	default:
   2363		info->data = 0;
   2364		break;
   2365	}
   2366}
   2367
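        /* Translate an ETHTOOL_SRXFH request into virtio RSS hash-type bits
         * and, if NETIF_F_RXHASH is enabled, commit the new configuration to
         * the device.
         */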
   2368static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info)
   2369{
   2370	u32 new_hashtypes = vi->rss_hash_types_saved;
   2371	bool is_disable = info->data & RXH_DISCARD;
   2372	bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3);
   2373
    2374	/* supports only 'sd' (IP src/dst), 'sdfn' (plus L4 ports) and 'r' (discard) */
   2375	if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable))
   2376		return false;
   2377
   2378	switch (info->flow_type) {
   2379	case TCP_V4_FLOW:
   2380		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4);
   2381		if (!is_disable)
   2382			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
   2383				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0);
   2384		break;
   2385	case UDP_V4_FLOW:
   2386		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4);
   2387		if (!is_disable)
   2388			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
   2389				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0);
   2390		break;
   2391	case IPV4_FLOW:
   2392		new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4;
   2393		if (!is_disable)
   2394			new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4;
   2395		break;
   2396	case TCP_V6_FLOW:
   2397		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6);
   2398		if (!is_disable)
   2399			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
   2400				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0);
   2401		break;
   2402	case UDP_V6_FLOW:
   2403		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6);
   2404		if (!is_disable)
   2405			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
   2406				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0);
   2407		break;
   2408	case IPV6_FLOW:
   2409		new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6;
   2410		if (!is_disable)
   2411			new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6;
   2412		break;
   2413	default:
   2414		/* unsupported flow */
   2415		return false;
   2416	}
   2417
    2418	/* reject the request if it would enable an unsupported hash type */
   2419	if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported))
   2420		return false;
   2421
   2422	if (new_hashtypes != vi->rss_hash_types_saved) {
   2423		vi->rss_hash_types_saved = new_hashtypes;
   2424		vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
   2425		if (vi->dev->features & NETIF_F_RXHASH)
   2426			return virtnet_commit_rss_command(vi);
   2427	}
   2428
   2429	return true;
   2430}
   2431
   2432static void virtnet_get_drvinfo(struct net_device *dev,
   2433				struct ethtool_drvinfo *info)
   2434{
   2435	struct virtnet_info *vi = netdev_priv(dev);
   2436	struct virtio_device *vdev = vi->vdev;
   2437
   2438	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
   2439	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
   2440	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
   2441
   2442}
   2443
    2444/* TODO: Eliminate out-of-order (OOO) packets during queue-count switching */
   2445static int virtnet_set_channels(struct net_device *dev,
   2446				struct ethtool_channels *channels)
   2447{
   2448	struct virtnet_info *vi = netdev_priv(dev);
   2449	u16 queue_pairs = channels->combined_count;
   2450	int err;
   2451
   2452	/* We don't support separate rx/tx channels.
   2453	 * We don't allow setting 'other' channels.
   2454	 */
   2455	if (channels->rx_count || channels->tx_count || channels->other_count)
   2456		return -EINVAL;
   2457
   2458	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
   2459		return -EINVAL;
   2460
    2461	/* For now we don't support modifying channels while XDP is loaded.
    2462	 * Also, when XDP is loaded all RX queues have XDP programs, so we
    2463	 * only need to check a single RX queue.
    2464	 */
   2465	if (vi->rq[0].xdp_prog)
   2466		return -EINVAL;
   2467
   2468	cpus_read_lock();
   2469	err = _virtnet_set_queues(vi, queue_pairs);
   2470	if (err) {
   2471		cpus_read_unlock();
   2472		goto err;
   2473	}
   2474	virtnet_set_affinity(vi);
   2475	cpus_read_unlock();
   2476
   2477	netif_set_real_num_tx_queues(dev, queue_pairs);
   2478	netif_set_real_num_rx_queues(dev, queue_pairs);
   2479 err:
   2480	return err;
   2481}
   2482
   2483static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
   2484{
   2485	struct virtnet_info *vi = netdev_priv(dev);
   2486	unsigned int i, j;
   2487	u8 *p = data;
   2488
   2489	switch (stringset) {
   2490	case ETH_SS_STATS:
   2491		for (i = 0; i < vi->curr_queue_pairs; i++) {
   2492			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
   2493				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
   2494						virtnet_rq_stats_desc[j].desc);
   2495		}
   2496
   2497		for (i = 0; i < vi->curr_queue_pairs; i++) {
   2498			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
   2499				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
   2500						virtnet_sq_stats_desc[j].desc);
   2501		}
   2502		break;
   2503	}
   2504}
   2505
   2506static int virtnet_get_sset_count(struct net_device *dev, int sset)
   2507{
   2508	struct virtnet_info *vi = netdev_priv(dev);
   2509
   2510	switch (sset) {
   2511	case ETH_SS_STATS:
   2512		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
   2513					       VIRTNET_SQ_STATS_LEN);
   2514	default:
   2515		return -EOPNOTSUPP;
   2516	}
   2517}
   2518
   2519static void virtnet_get_ethtool_stats(struct net_device *dev,
   2520				      struct ethtool_stats *stats, u64 *data)
   2521{
   2522	struct virtnet_info *vi = netdev_priv(dev);
   2523	unsigned int idx = 0, start, i, j;
   2524	const u8 *stats_base;
   2525	size_t offset;
   2526
   2527	for (i = 0; i < vi->curr_queue_pairs; i++) {
   2528		struct receive_queue *rq = &vi->rq[i];
   2529
   2530		stats_base = (u8 *)&rq->stats;
   2531		do {
   2532			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
   2533			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
   2534				offset = virtnet_rq_stats_desc[j].offset;
   2535				data[idx + j] = *(u64 *)(stats_base + offset);
   2536			}
   2537		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
   2538		idx += VIRTNET_RQ_STATS_LEN;
   2539	}
   2540
   2541	for (i = 0; i < vi->curr_queue_pairs; i++) {
   2542		struct send_queue *sq = &vi->sq[i];
   2543
   2544		stats_base = (u8 *)&sq->stats;
   2545		do {
   2546			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
   2547			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
   2548				offset = virtnet_sq_stats_desc[j].offset;
   2549				data[idx + j] = *(u64 *)(stats_base + offset);
   2550			}
   2551		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
   2552		idx += VIRTNET_SQ_STATS_LEN;
   2553	}
   2554}
   2555
   2556static void virtnet_get_channels(struct net_device *dev,
   2557				 struct ethtool_channels *channels)
   2558{
   2559	struct virtnet_info *vi = netdev_priv(dev);
   2560
   2561	channels->combined_count = vi->curr_queue_pairs;
   2562	channels->max_combined = vi->max_queue_pairs;
   2563	channels->max_other = 0;
   2564	channels->rx_count = 0;
   2565	channels->tx_count = 0;
   2566	channels->other_count = 0;
   2567}
   2568
   2569static int virtnet_set_link_ksettings(struct net_device *dev,
   2570				      const struct ethtool_link_ksettings *cmd)
   2571{
   2572	struct virtnet_info *vi = netdev_priv(dev);
   2573
   2574	return ethtool_virtdev_set_link_ksettings(dev, cmd,
   2575						  &vi->speed, &vi->duplex);
   2576}
   2577
   2578static int virtnet_get_link_ksettings(struct net_device *dev,
   2579				      struct ethtool_link_ksettings *cmd)
   2580{
   2581	struct virtnet_info *vi = netdev_priv(dev);
   2582
   2583	cmd->base.speed = vi->speed;
   2584	cmd->base.duplex = vi->duplex;
   2585	cmd->base.port = PORT_OTHER;
   2586
   2587	return 0;
   2588}
   2589
   2590static int virtnet_set_coalesce(struct net_device *dev,
   2591				struct ethtool_coalesce *ec,
   2592				struct kernel_ethtool_coalesce *kernel_coal,
   2593				struct netlink_ext_ack *extack)
   2594{
   2595	struct virtnet_info *vi = netdev_priv(dev);
   2596	int i, napi_weight;
   2597
   2598	if (ec->tx_max_coalesced_frames > 1 ||
   2599	    ec->rx_max_coalesced_frames != 1)
   2600		return -EINVAL;
   2601
   2602	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
   2603	if (napi_weight ^ vi->sq[0].napi.weight) {
   2604		if (dev->flags & IFF_UP)
   2605			return -EBUSY;
   2606		for (i = 0; i < vi->max_queue_pairs; i++)
   2607			vi->sq[i].napi.weight = napi_weight;
   2608	}
   2609
   2610	return 0;
   2611}
   2612
   2613static int virtnet_get_coalesce(struct net_device *dev,
   2614				struct ethtool_coalesce *ec,
   2615				struct kernel_ethtool_coalesce *kernel_coal,
   2616				struct netlink_ext_ack *extack)
   2617{
   2618	struct ethtool_coalesce ec_default = {
   2619		.cmd = ETHTOOL_GCOALESCE,
   2620		.rx_max_coalesced_frames = 1,
   2621	};
   2622	struct virtnet_info *vi = netdev_priv(dev);
   2623
   2624	memcpy(ec, &ec_default, sizeof(ec_default));
   2625
   2626	if (vi->sq[0].napi.weight)
   2627		ec->tx_max_coalesced_frames = 1;
   2628
   2629	return 0;
   2630}
   2631
   2632static void virtnet_init_settings(struct net_device *dev)
   2633{
   2634	struct virtnet_info *vi = netdev_priv(dev);
   2635
   2636	vi->speed = SPEED_UNKNOWN;
   2637	vi->duplex = DUPLEX_UNKNOWN;
   2638}
   2639
   2640static void virtnet_update_settings(struct virtnet_info *vi)
   2641{
   2642	u32 speed;
   2643	u8 duplex;
   2644
   2645	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
   2646		return;
   2647
   2648	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
   2649
   2650	if (ethtool_validate_speed(speed))
   2651		vi->speed = speed;
   2652
   2653	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
   2654
   2655	if (ethtool_validate_duplex(duplex))
   2656		vi->duplex = duplex;
   2657}
   2658
   2659static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
   2660{
   2661	return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
   2662}
   2663
   2664static u32 virtnet_get_rxfh_indir_size(struct net_device *dev)
   2665{
   2666	return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size;
   2667}
   2668
   2669static int virtnet_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
   2670{
   2671	struct virtnet_info *vi = netdev_priv(dev);
   2672	int i;
   2673
   2674	if (indir) {
   2675		for (i = 0; i < vi->rss_indir_table_size; ++i)
   2676			indir[i] = vi->ctrl->rss.indirection_table[i];
   2677	}
   2678
   2679	if (key)
   2680		memcpy(key, vi->ctrl->rss.key, vi->rss_key_size);
   2681
   2682	if (hfunc)
   2683		*hfunc = ETH_RSS_HASH_TOP;
   2684
   2685	return 0;
   2686}
   2687
   2688static int virtnet_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc)
   2689{
   2690	struct virtnet_info *vi = netdev_priv(dev);
   2691	int i;
   2692
   2693	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
   2694		return -EOPNOTSUPP;
   2695
   2696	if (indir) {
   2697		for (i = 0; i < vi->rss_indir_table_size; ++i)
   2698			vi->ctrl->rss.indirection_table[i] = indir[i];
   2699	}
   2700	if (key)
   2701		memcpy(vi->ctrl->rss.key, key, vi->rss_key_size);
   2702
   2703	virtnet_commit_rss_command(vi);
   2704
   2705	return 0;
   2706}
   2707
   2708static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs)
   2709{
   2710	struct virtnet_info *vi = netdev_priv(dev);
   2711	int rc = 0;
   2712
   2713	switch (info->cmd) {
   2714	case ETHTOOL_GRXRINGS:
   2715		info->data = vi->curr_queue_pairs;
   2716		break;
   2717	case ETHTOOL_GRXFH:
   2718		virtnet_get_hashflow(vi, info);
   2719		break;
   2720	default:
   2721		rc = -EOPNOTSUPP;
   2722	}
   2723
   2724	return rc;
   2725}
   2726
   2727static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
   2728{
   2729	struct virtnet_info *vi = netdev_priv(dev);
   2730	int rc = 0;
   2731
   2732	switch (info->cmd) {
   2733	case ETHTOOL_SRXFH:
   2734		if (!virtnet_set_hashflow(vi, info))
   2735			rc = -EINVAL;
   2736
   2737		break;
   2738	default:
   2739		rc = -EOPNOTSUPP;
   2740	}
   2741
   2742	return rc;
   2743}
   2744
   2745static const struct ethtool_ops virtnet_ethtool_ops = {
   2746	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
   2747	.get_drvinfo = virtnet_get_drvinfo,
   2748	.get_link = ethtool_op_get_link,
   2749	.get_ringparam = virtnet_get_ringparam,
   2750	.get_strings = virtnet_get_strings,
   2751	.get_sset_count = virtnet_get_sset_count,
   2752	.get_ethtool_stats = virtnet_get_ethtool_stats,
   2753	.set_channels = virtnet_set_channels,
   2754	.get_channels = virtnet_get_channels,
   2755	.get_ts_info = ethtool_op_get_ts_info,
   2756	.get_link_ksettings = virtnet_get_link_ksettings,
   2757	.set_link_ksettings = virtnet_set_link_ksettings,
   2758	.set_coalesce = virtnet_set_coalesce,
   2759	.get_coalesce = virtnet_get_coalesce,
   2760	.get_rxfh_key_size = virtnet_get_rxfh_key_size,
   2761	.get_rxfh_indir_size = virtnet_get_rxfh_indir_size,
   2762	.get_rxfh = virtnet_get_rxfh,
   2763	.set_rxfh = virtnet_set_rxfh,
   2764	.get_rxnfc = virtnet_get_rxnfc,
   2765	.set_rxnfc = virtnet_set_rxnfc,
   2766};
   2767
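        /* Quiesce the device for freeze/suspend: make sure the config-change
         * work is not running, detach the netdev, and close the queues if the
         * interface was up.
         */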
   2768static void virtnet_freeze_down(struct virtio_device *vdev)
   2769{
   2770	struct virtnet_info *vi = vdev->priv;
   2771
   2772	/* Make sure no work handler is accessing the device */
   2773	flush_work(&vi->config_work);
   2774
   2775	netif_tx_lock_bh(vi->dev);
   2776	netif_device_detach(vi->dev);
   2777	netif_tx_unlock_bh(vi->dev);
   2778	if (netif_running(vi->dev))
   2779		virtnet_close(vi->dev);
   2780}
   2781
   2782static int init_vqs(struct virtnet_info *vi);
   2783
   2784static int virtnet_restore_up(struct virtio_device *vdev)
   2785{
   2786	struct virtnet_info *vi = vdev->priv;
   2787	int err;
   2788
   2789	err = init_vqs(vi);
   2790	if (err)
   2791		return err;
   2792
   2793	virtio_device_ready(vdev);
   2794
   2795	if (netif_running(vi->dev)) {
   2796		err = virtnet_open(vi->dev);
   2797		if (err)
   2798			return err;
   2799	}
   2800
   2801	netif_tx_lock_bh(vi->dev);
   2802	netif_device_attach(vi->dev);
   2803	netif_tx_unlock_bh(vi->dev);
   2804	return err;
   2805}
   2806
   2807static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
   2808{
   2809	struct scatterlist sg;
   2810	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
   2811
   2812	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
   2813
   2814	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
   2815				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
    2816		dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
   2817		return -EINVAL;
   2818	}
   2819
   2820	return 0;
   2821}
   2822
   2823static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
   2824{
   2825	u64 offloads = 0;
   2826
   2827	if (!vi->guest_offloads)
   2828		return 0;
   2829
   2830	return virtnet_set_guest_offloads(vi, offloads);
   2831}
   2832
   2833static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
   2834{
   2835	u64 offloads = vi->guest_offloads;
   2836
   2837	if (!vi->guest_offloads)
   2838		return 0;
   2839
   2840	return virtnet_set_guest_offloads(vi, offloads);
   2841}
   2842
   2843static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
   2844			   struct netlink_ext_ack *extack)
   2845{
   2846	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
   2847	struct virtnet_info *vi = netdev_priv(dev);
   2848	struct bpf_prog *old_prog;
   2849	u16 xdp_qp = 0, curr_qp;
   2850	int i, err;
   2851
   2852	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
   2853	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
   2854	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
   2855	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
   2856		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
   2857		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
   2858		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
   2859		return -EOPNOTSUPP;
   2860	}
   2861
   2862	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
   2863		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
   2864		return -EINVAL;
   2865	}
   2866
   2867	if (dev->mtu > max_sz) {
   2868		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
   2869		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
   2870		return -EINVAL;
   2871	}
   2872
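        	/* XDP_TX wants one extra send queue per possible CPU so each CPU
        	 * can transmit without locking; if the device cannot provide that
        	 * many queues, fall back to sharing the regular TX queues in the
        	 * slower locked mode (see the warning below).
        	 */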
   2873	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
   2874	if (prog)
   2875		xdp_qp = nr_cpu_ids;
   2876
   2877	/* XDP requires extra queues for XDP_TX */
   2878	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
    2879		netdev_warn_once(dev, "XDP requests %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
   2880				 curr_qp + xdp_qp, vi->max_queue_pairs);
   2881		xdp_qp = 0;
   2882	}
   2883
   2884	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
   2885	if (!prog && !old_prog)
   2886		return 0;
   2887
   2888	if (prog)
   2889		bpf_prog_add(prog, vi->max_queue_pairs - 1);
   2890
   2891	/* Make sure NAPI is not using any XDP TX queues for RX. */
   2892	if (netif_running(dev)) {
   2893		for (i = 0; i < vi->max_queue_pairs; i++) {
   2894			napi_disable(&vi->rq[i].napi);
   2895			virtnet_napi_tx_disable(&vi->sq[i].napi);
   2896		}
   2897	}
   2898
   2899	if (!prog) {
   2900		for (i = 0; i < vi->max_queue_pairs; i++) {
   2901			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
   2902			if (i == 0)
   2903				virtnet_restore_guest_offloads(vi);
   2904		}
   2905		synchronize_net();
   2906	}
   2907
   2908	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
   2909	if (err)
   2910		goto err;
   2911	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
   2912	vi->xdp_queue_pairs = xdp_qp;
   2913
   2914	if (prog) {
   2915		vi->xdp_enabled = true;
   2916		for (i = 0; i < vi->max_queue_pairs; i++) {
   2917			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
   2918			if (i == 0 && !old_prog)
   2919				virtnet_clear_guest_offloads(vi);
   2920		}
   2921	} else {
   2922		vi->xdp_enabled = false;
   2923	}
   2924
   2925	for (i = 0; i < vi->max_queue_pairs; i++) {
   2926		if (old_prog)
   2927			bpf_prog_put(old_prog);
   2928		if (netif_running(dev)) {
   2929			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
   2930			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
   2931					       &vi->sq[i].napi);
   2932		}
   2933	}
   2934
   2935	return 0;
   2936
   2937err:
   2938	if (!prog) {
   2939		virtnet_clear_guest_offloads(vi);
   2940		for (i = 0; i < vi->max_queue_pairs; i++)
   2941			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
   2942	}
   2943
   2944	if (netif_running(dev)) {
   2945		for (i = 0; i < vi->max_queue_pairs; i++) {
   2946			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
   2947			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
   2948					       &vi->sq[i].napi);
   2949		}
   2950	}
   2951	if (prog)
   2952		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
   2953	return err;
   2954}
   2955
   2956static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
   2957{
   2958	switch (xdp->command) {
   2959	case XDP_SETUP_PROG:
   2960		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
   2961	default:
   2962		return -EINVAL;
   2963	}
   2964}
   2965
   2966static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
   2967				      size_t len)
   2968{
   2969	struct virtnet_info *vi = netdev_priv(dev);
   2970	int ret;
   2971
   2972	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
   2973		return -EOPNOTSUPP;
   2974
   2975	ret = snprintf(buf, len, "sby");
   2976	if (ret >= len)
   2977		return -EOPNOTSUPP;
   2978
   2979	return 0;
   2980}
   2981
   2982static int virtnet_set_features(struct net_device *dev,
   2983				netdev_features_t features)
   2984{
   2985	struct virtnet_info *vi = netdev_priv(dev);
   2986	u64 offloads;
   2987	int err;
   2988
   2989	if ((dev->features ^ features) & NETIF_F_GRO_HW) {
   2990		if (vi->xdp_enabled)
   2991			return -EBUSY;
   2992
   2993		if (features & NETIF_F_GRO_HW)
   2994			offloads = vi->guest_offloads_capable;
   2995		else
   2996			offloads = vi->guest_offloads_capable &
   2997				   ~GUEST_OFFLOAD_GRO_HW_MASK;
   2998
   2999		err = virtnet_set_guest_offloads(vi, offloads);
   3000		if (err)
   3001			return err;
   3002		vi->guest_offloads = offloads;
   3003	}
   3004
   3005	if ((dev->features ^ features) & NETIF_F_RXHASH) {
   3006		if (features & NETIF_F_RXHASH)
   3007			vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
   3008		else
   3009			vi->ctrl->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE;
   3010
   3011		if (!virtnet_commit_rss_command(vi))
   3012			return -EINVAL;
   3013	}
   3014
   3015	return 0;
   3016}
   3017
   3018static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
   3019{
   3020	struct virtnet_info *priv = netdev_priv(dev);
   3021	struct send_queue *sq = &priv->sq[txqueue];
   3022	struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);
   3023
   3024	u64_stats_update_begin(&sq->stats.syncp);
   3025	sq->stats.tx_timeouts++;
   3026	u64_stats_update_end(&sq->stats.syncp);
   3027
   3028	netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n",
   3029		   txqueue, sq->name, sq->vq->index, sq->vq->name,
   3030		   jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
   3031}
   3032
   3033static const struct net_device_ops virtnet_netdev = {
   3034	.ndo_open            = virtnet_open,
    3035	.ndo_stop            = virtnet_close,
   3036	.ndo_start_xmit      = start_xmit,
   3037	.ndo_validate_addr   = eth_validate_addr,
   3038	.ndo_set_mac_address = virtnet_set_mac_address,
   3039	.ndo_set_rx_mode     = virtnet_set_rx_mode,
   3040	.ndo_get_stats64     = virtnet_stats,
   3041	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
   3042	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
   3043	.ndo_bpf		= virtnet_xdp,
   3044	.ndo_xdp_xmit		= virtnet_xdp_xmit,
   3045	.ndo_features_check	= passthru_features_check,
   3046	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
   3047	.ndo_set_features	= virtnet_set_features,
   3048	.ndo_tx_timeout		= virtnet_tx_timeout,
   3049};
   3050
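        /* Handle a config-change interrupt: acknowledge link announcements and
         * propagate link up/down state to the networking core.
         */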
   3051static void virtnet_config_changed_work(struct work_struct *work)
   3052{
   3053	struct virtnet_info *vi =
   3054		container_of(work, struct virtnet_info, config_work);
   3055	u16 v;
   3056
   3057	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
   3058				 struct virtio_net_config, status, &v) < 0)
   3059		return;
   3060
   3061	if (v & VIRTIO_NET_S_ANNOUNCE) {
   3062		netdev_notify_peers(vi->dev);
   3063		virtnet_ack_link_announce(vi);
   3064	}
   3065
   3066	/* Ignore unknown (future) status bits */
   3067	v &= VIRTIO_NET_S_LINK_UP;
   3068
   3069	if (vi->status == v)
   3070		return;
   3071
   3072	vi->status = v;
   3073
   3074	if (vi->status & VIRTIO_NET_S_LINK_UP) {
   3075		virtnet_update_settings(vi);
   3076		netif_carrier_on(vi->dev);
   3077		netif_tx_wake_all_queues(vi->dev);
   3078	} else {
   3079		netif_carrier_off(vi->dev);
   3080		netif_tx_stop_all_queues(vi->dev);
   3081	}
   3082}
   3083
   3084static void virtnet_config_changed(struct virtio_device *vdev)
   3085{
   3086	struct virtnet_info *vi = vdev->priv;
   3087
   3088	schedule_work(&vi->config_work);
   3089}
   3090
   3091static void virtnet_free_queues(struct virtnet_info *vi)
   3092{
   3093	int i;
   3094
   3095	for (i = 0; i < vi->max_queue_pairs; i++) {
   3096		__netif_napi_del(&vi->rq[i].napi);
   3097		__netif_napi_del(&vi->sq[i].napi);
   3098	}
   3099
    3100	/* We called __netif_napi_del(), so we need to respect an RCU
    3101	 * grace period before freeing vi->rq.
    3102	 */
   3103	synchronize_net();
   3104
   3105	kfree(vi->rq);
   3106	kfree(vi->sq);
   3107	kfree(vi->ctrl);
   3108}
   3109
   3110static void _free_receive_bufs(struct virtnet_info *vi)
   3111{
   3112	struct bpf_prog *old_prog;
   3113	int i;
   3114
   3115	for (i = 0; i < vi->max_queue_pairs; i++) {
   3116		while (vi->rq[i].pages)
   3117			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
   3118
   3119		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
   3120		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
   3121		if (old_prog)
   3122			bpf_prog_put(old_prog);
   3123	}
   3124}
   3125
   3126static void free_receive_bufs(struct virtnet_info *vi)
   3127{
   3128	rtnl_lock();
   3129	_free_receive_bufs(vi);
   3130	rtnl_unlock();
   3131}
   3132
   3133static void free_receive_page_frags(struct virtnet_info *vi)
   3134{
   3135	int i;
   3136	for (i = 0; i < vi->max_queue_pairs; i++)
   3137		if (vi->rq[i].alloc_frag.page)
   3138			put_page(vi->rq[i].alloc_frag.page);
   3139}
   3140
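        /* Return buffers still sitting in the virtqueues during teardown:
         * transmit buffers are either skbs or xdp_frames, receive buffers are
         * pages or page fragments depending on the receive mode.
         */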
   3141static void free_unused_bufs(struct virtnet_info *vi)
   3142{
   3143	void *buf;
   3144	int i;
   3145
   3146	for (i = 0; i < vi->max_queue_pairs; i++) {
   3147		struct virtqueue *vq = vi->sq[i].vq;
   3148		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
   3149			if (!is_xdp_frame(buf))
   3150				dev_kfree_skb(buf);
   3151			else
   3152				xdp_return_frame(ptr_to_xdp(buf));
   3153		}
   3154	}
   3155
   3156	for (i = 0; i < vi->max_queue_pairs; i++) {
   3157		struct virtqueue *vq = vi->rq[i].vq;
   3158
   3159		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
   3160			if (vi->mergeable_rx_bufs) {
   3161				put_page(virt_to_head_page(buf));
   3162			} else if (vi->big_packets) {
   3163				give_pages(&vi->rq[i], buf);
   3164			} else {
   3165				put_page(virt_to_head_page(buf));
   3166			}
   3167		}
   3168	}
   3169}
   3170
   3171static void virtnet_del_vqs(struct virtnet_info *vi)
   3172{
   3173	struct virtio_device *vdev = vi->vdev;
   3174
   3175	virtnet_clean_affinity(vi);
   3176
   3177	vdev->config->del_vqs(vdev);
   3178
   3179	virtnet_free_queues(vi);
   3180}
   3181
   3182/* How large should a single buffer be so a queue full of these can fit at
   3183 * least one full packet?
   3184 * Logic below assumes the mergeable buffer header is used.
   3185 */
   3186static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
   3187{
   3188	const unsigned int hdr_len = vi->hdr_len;
   3189	unsigned int rq_size = virtqueue_get_vring_size(vq);
   3190	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
   3191	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
   3192	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
   3193
   3194	return max(max(min_buf_len, hdr_len) - hdr_len,
   3195		   (unsigned int)GOOD_PACKET_LEN);
   3196}
   3197
   3198static int virtnet_find_vqs(struct virtnet_info *vi)
   3199{
   3200	vq_callback_t **callbacks;
   3201	struct virtqueue **vqs;
   3202	int ret = -ENOMEM;
   3203	int i, total_vqs;
   3204	const char **names;
   3205	bool *ctx;
   3206
    3207	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
    3208	 * possibly N-1 more RX/TX queue pairs used in multiqueue mode, followed
    3209	 * by a possible control vq.
    3210	 */
   3211	total_vqs = vi->max_queue_pairs * 2 +
   3212		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
   3213
   3214	/* Allocate space for find_vqs parameters */
   3215	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
   3216	if (!vqs)
   3217		goto err_vq;
   3218	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
   3219	if (!callbacks)
   3220		goto err_callback;
   3221	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
   3222	if (!names)
   3223		goto err_names;
   3224	if (!vi->big_packets || vi->mergeable_rx_bufs) {
   3225		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
   3226		if (!ctx)
   3227			goto err_ctx;
   3228	} else {
   3229		ctx = NULL;
   3230	}
   3231
   3232	/* Parameters for control virtqueue, if any */
   3233	if (vi->has_cvq) {
   3234		callbacks[total_vqs - 1] = NULL;
   3235		names[total_vqs - 1] = "control";
   3236	}
   3237
   3238	/* Allocate/initialize parameters for send/receive virtqueues */
   3239	for (i = 0; i < vi->max_queue_pairs; i++) {
   3240		callbacks[rxq2vq(i)] = skb_recv_done;
   3241		callbacks[txq2vq(i)] = skb_xmit_done;
   3242		sprintf(vi->rq[i].name, "input.%d", i);
   3243		sprintf(vi->sq[i].name, "output.%d", i);
   3244		names[rxq2vq(i)] = vi->rq[i].name;
   3245		names[txq2vq(i)] = vi->sq[i].name;
   3246		if (ctx)
   3247			ctx[rxq2vq(i)] = true;
   3248	}
   3249
   3250	ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
   3251				  names, ctx, NULL);
   3252	if (ret)
   3253		goto err_find;
   3254
   3255	if (vi->has_cvq) {
   3256		vi->cvq = vqs[total_vqs - 1];
   3257		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
   3258			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
   3259	}
   3260
   3261	for (i = 0; i < vi->max_queue_pairs; i++) {
   3262		vi->rq[i].vq = vqs[rxq2vq(i)];
   3263		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
   3264		vi->sq[i].vq = vqs[txq2vq(i)];
   3265	}
   3266
    3267	/* Success: ret == 0; fall through to free the temporary arrays. */
   3268
   3269
   3270err_find:
   3271	kfree(ctx);
   3272err_ctx:
   3273	kfree(names);
   3274err_names:
   3275	kfree(callbacks);
   3276err_callback:
   3277	kfree(vqs);
   3278err_vq:
   3279	return ret;
   3280}
   3281
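        /* Allocate the per-queue send/receive state and the control buffer (when
         * a control vq exists) and register the NAPI instances; the virtqueues
         * themselves are obtained later by virtnet_find_vqs().
         */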
   3282static int virtnet_alloc_queues(struct virtnet_info *vi)
   3283{
   3284	int i;
   3285
   3286	if (vi->has_cvq) {
   3287		vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
   3288		if (!vi->ctrl)
   3289			goto err_ctrl;
   3290	} else {
   3291		vi->ctrl = NULL;
   3292	}
   3293	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
   3294	if (!vi->sq)
   3295		goto err_sq;
   3296	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
   3297	if (!vi->rq)
   3298		goto err_rq;
   3299
   3300	INIT_DELAYED_WORK(&vi->refill, refill_work);
   3301	for (i = 0; i < vi->max_queue_pairs; i++) {
   3302		vi->rq[i].pages = NULL;
   3303		netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll,
   3304				      napi_weight);
   3305		netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi,
   3306					 virtnet_poll_tx,
   3307					 napi_tx ? napi_weight : 0);
   3308
   3309		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
   3310		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
   3311		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
   3312
   3313		u64_stats_init(&vi->rq[i].stats.syncp);
   3314		u64_stats_init(&vi->sq[i].stats.syncp);
   3315	}
   3316
   3317	return 0;
   3318
   3319err_rq:
   3320	kfree(vi->sq);
   3321err_sq:
   3322	kfree(vi->ctrl);
   3323err_ctrl:
   3324	return -ENOMEM;
   3325}
   3326
   3327static int init_vqs(struct virtnet_info *vi)
   3328{
   3329	int ret;
   3330
   3331	/* Allocate send & receive queues */
   3332	ret = virtnet_alloc_queues(vi);
   3333	if (ret)
   3334		goto err;
   3335
   3336	ret = virtnet_find_vqs(vi);
   3337	if (ret)
   3338		goto err_free;
   3339
   3340	cpus_read_lock();
   3341	virtnet_set_affinity(vi);
   3342	cpus_read_unlock();
   3343
   3344	return 0;
   3345
   3346err_free:
   3347	virtnet_free_queues(vi);
   3348err:
   3349	return ret;
   3350}
   3351
   3352#ifdef CONFIG_SYSFS
   3353static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
   3354		char *buf)
   3355{
   3356	struct virtnet_info *vi = netdev_priv(queue->dev);
   3357	unsigned int queue_index = get_netdev_rx_queue_index(queue);
   3358	unsigned int headroom = virtnet_get_headroom(vi);
   3359	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
   3360	struct ewma_pkt_len *avg;
   3361
   3362	BUG_ON(queue_index >= vi->max_queue_pairs);
   3363	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
   3364	return sprintf(buf, "%u\n",
   3365		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
   3366				       SKB_DATA_ALIGN(headroom + tailroom)));
   3367}
   3368
   3369static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
   3370	__ATTR_RO(mergeable_rx_buffer_size);
   3371
   3372static struct attribute *virtio_net_mrg_rx_attrs[] = {
   3373	&mergeable_rx_buffer_size_attribute.attr,
   3374	NULL
   3375};
   3376
   3377static const struct attribute_group virtio_net_mrg_rx_group = {
   3378	.name = "virtio_net",
   3379	.attrs = virtio_net_mrg_rx_attrs
   3380};
   3381#endif
   3382
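        	/* Helper for virtnet_validate_features(): if the device advertises
        	 * @fbit, complain that the feature it depends on is missing and
        	 * return true.
        	 */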
   3383static bool virtnet_fail_on_feature(struct virtio_device *vdev,
   3384				    unsigned int fbit,
   3385				    const char *fname, const char *dname)
   3386{
   3387	if (!virtio_has_feature(vdev, fbit))
   3388		return false;
   3389
   3390	dev_err(&vdev->dev, "device advertises feature %s but not %s",
   3391		fname, dname);
   3392
   3393	return true;
   3394}
   3395
   3396#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
   3397	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
   3398
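        	/* Every control-path feature depends on VIRTIO_NET_F_CTRL_VQ;
        	 * refuse devices that advertise any of them without it.
        	 */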
   3399static bool virtnet_validate_features(struct virtio_device *vdev)
   3400{
   3401	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
   3402	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
   3403			     "VIRTIO_NET_F_CTRL_VQ") ||
   3404	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
   3405			     "VIRTIO_NET_F_CTRL_VQ") ||
   3406	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
   3407			     "VIRTIO_NET_F_CTRL_VQ") ||
   3408	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
   3409	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
   3410			     "VIRTIO_NET_F_CTRL_VQ") ||
   3411	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
   3412			     "VIRTIO_NET_F_CTRL_VQ") ||
   3413	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
   3414			     "VIRTIO_NET_F_CTRL_VQ"))) {
   3415		return false;
   3416	}
   3417
   3418	return true;
   3419}
   3420
   3421#define MIN_MTU ETH_MIN_MTU
   3422#define MAX_MTU ETH_MAX_MTU
   3423
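        	/* Early validation, run before probe: require config space access,
        	 * check feature dependencies, and clear VIRTIO_NET_F_MTU when the
        	 * advertised MTU is below the minimum we can use.
        	 */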
   3424static int virtnet_validate(struct virtio_device *vdev)
   3425{
   3426	if (!vdev->config->get) {
   3427		dev_err(&vdev->dev, "%s failure: config access disabled\n",
   3428			__func__);
   3429		return -EINVAL;
   3430	}
   3431
   3432	if (!virtnet_validate_features(vdev))
   3433		return -EINVAL;
   3434
   3435	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
   3436		int mtu = virtio_cread16(vdev,
   3437					 offsetof(struct virtio_net_config,
   3438						  mtu));
   3439		if (mtu < MIN_MTU)
   3440			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
   3441	}
   3442
   3443	return 0;
   3444}
   3445
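        	/* Probe: allocate the netdev, translate device features into netdev
        	 * features, read config space (MAC, MTU, RSS parameters), create the
        	 * virtqueues and register the network device.
        	 */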
   3446static int virtnet_probe(struct virtio_device *vdev)
   3447{
   3448	int i, err = -ENOMEM;
   3449	struct net_device *dev;
   3450	struct virtnet_info *vi;
   3451	u16 max_queue_pairs;
   3452	int mtu;
   3453
    3454	/* Find out whether the host supports a multiqueue/RSS virtio_net device */
   3455	max_queue_pairs = 1;
   3456	if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
   3457		max_queue_pairs =
   3458		     virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs));
   3459
    3460	/* Fall back to a single queue pair if the count is invalid or there is no control VQ */
   3461	if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
   3462	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
   3463	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
   3464		max_queue_pairs = 1;
   3465
   3466	/* Allocate ourselves a network device with room for our info */
   3467	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
   3468	if (!dev)
   3469		return -ENOMEM;
   3470
   3471	/* Set up network device as normal. */
   3472	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
   3473			   IFF_TX_SKB_NO_LINEAR;
   3474	dev->netdev_ops = &virtnet_netdev;
   3475	dev->features = NETIF_F_HIGHDMA;
   3476
   3477	dev->ethtool_ops = &virtnet_ethtool_ops;
   3478	SET_NETDEV_DEV(dev, &vdev->dev);
   3479
   3480	/* Do we support "hardware" checksums? */
   3481	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
   3482		/* This opens up the world of extra features. */
   3483		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
   3484		if (csum)
   3485			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
   3486
   3487		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
   3488			dev->hw_features |= NETIF_F_TSO
   3489				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
   3490		}
    3491		/* Individual feature bits: what can the host handle? */
   3492		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
   3493			dev->hw_features |= NETIF_F_TSO;
   3494		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
   3495			dev->hw_features |= NETIF_F_TSO6;
   3496		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
   3497			dev->hw_features |= NETIF_F_TSO_ECN;
   3498
   3499		dev->features |= NETIF_F_GSO_ROBUST;
   3500
   3501		if (gso)
   3502			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
   3503		/* (!csum && gso) case will be fixed by register_netdev() */
   3504	}
   3505	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
   3506		dev->features |= NETIF_F_RXCSUM;
   3507	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
   3508	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
   3509		dev->features |= NETIF_F_GRO_HW;
   3510	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
   3511		dev->hw_features |= NETIF_F_GRO_HW;
   3512
   3513	dev->vlan_features = dev->features;
   3514
   3515	/* MTU range: 68 - 65535 */
   3516	dev->min_mtu = MIN_MTU;
   3517	dev->max_mtu = MAX_MTU;
   3518
   3519	/* Configuration may specify what MAC to use.  Otherwise random. */
   3520	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
   3521		u8 addr[ETH_ALEN];
   3522
   3523		virtio_cread_bytes(vdev,
   3524				   offsetof(struct virtio_net_config, mac),
   3525				   addr, ETH_ALEN);
   3526		eth_hw_addr_set(dev, addr);
   3527	} else {
   3528		eth_hw_addr_random(dev);
   3529	}
   3530
   3531	/* Set up our device-specific information */
   3532	vi = netdev_priv(dev);
   3533	vi->dev = dev;
   3534	vi->vdev = vdev;
   3535	vdev->priv = vi;
   3536
   3537	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
   3538
   3539	/* If we can receive ANY GSO packets, we must allocate large ones. */
   3540	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
   3541	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
   3542	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
   3543	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
   3544		vi->big_packets = true;
   3545
   3546	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
   3547		vi->mergeable_rx_bufs = true;
   3548
   3549	if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
   3550		vi->has_rss_hash_report = true;
   3551
   3552	if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
   3553		vi->has_rss = true;
   3554
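        	/* Read the device's RSS limits and drop hash types the driver does
        	 * not implement.
        	 */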
   3555	if (vi->has_rss || vi->has_rss_hash_report) {
   3556		vi->rss_indir_table_size =
   3557			virtio_cread16(vdev, offsetof(struct virtio_net_config,
   3558				rss_max_indirection_table_length));
   3559		vi->rss_key_size =
   3560			virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size));
   3561
   3562		vi->rss_hash_types_supported =
   3563		    virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types));
   3564		vi->rss_hash_types_supported &=
   3565				~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX |
   3566				  VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
   3567				  VIRTIO_NET_RSS_HASH_TYPE_UDP_EX);
   3568
   3569		dev->hw_features |= NETIF_F_RXHASH;
   3570	}
   3571
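        	/* Header layout: hash-report header when hash reporting is enabled,
        	 * the mergeable/modern header for MRG_RXBUF or VERSION_1 devices,
        	 * and the legacy header otherwise.
        	 */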
   3572	if (vi->has_rss_hash_report)
   3573		vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash);
   3574	else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
   3575		 virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
   3576		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
   3577	else
   3578		vi->hdr_len = sizeof(struct virtio_net_hdr);
   3579
   3580	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
   3581	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
   3582		vi->any_header_sg = true;
   3583
   3584	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
   3585		vi->has_cvq = true;
   3586
   3587	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
   3588		mtu = virtio_cread16(vdev,
   3589				     offsetof(struct virtio_net_config,
   3590					      mtu));
   3591		if (mtu < dev->min_mtu) {
   3592			/* Should never trigger: MTU was previously validated
   3593			 * in virtnet_validate.
   3594			 */
   3595			dev_err(&vdev->dev,
    3596	"device MTU appears to have changed, it is now %d < %d",
   3597				mtu, dev->min_mtu);
   3598			err = -EINVAL;
   3599			goto free;
   3600		}
   3601
   3602		dev->mtu = mtu;
   3603		dev->max_mtu = mtu;
   3604
   3605		/* TODO: size buffers correctly in this case. */
   3606		if (dev->mtu > ETH_DATA_LEN)
   3607			vi->big_packets = true;
   3608	}
   3609
   3610	if (vi->any_header_sg)
   3611		dev->needed_headroom = vi->hdr_len;
   3612
   3613	/* Enable multiqueue by default */
   3614	if (num_online_cpus() >= max_queue_pairs)
   3615		vi->curr_queue_pairs = max_queue_pairs;
   3616	else
   3617		vi->curr_queue_pairs = num_online_cpus();
   3618	vi->max_queue_pairs = max_queue_pairs;
   3619
   3620	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
   3621	err = init_vqs(vi);
   3622	if (err)
   3623		goto free;
   3624
   3625#ifdef CONFIG_SYSFS
   3626	if (vi->mergeable_rx_bufs)
   3627		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
   3628#endif
   3629	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
   3630	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
   3631
   3632	virtnet_init_settings(dev);
   3633
   3634	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
   3635		vi->failover = net_failover_create(vi->dev);
   3636		if (IS_ERR(vi->failover)) {
   3637			err = PTR_ERR(vi->failover);
   3638			goto free_vqs;
   3639		}
   3640	}
   3641
   3642	if (vi->has_rss || vi->has_rss_hash_report)
   3643		virtnet_init_default_rss(vi);
   3644
   3645	/* serialize netdev register + virtio_device_ready() with ndo_open() */
   3646	rtnl_lock();
   3647
   3648	err = register_netdevice(dev);
   3649	if (err) {
   3650		pr_debug("virtio_net: registering device failed\n");
   3651		rtnl_unlock();
   3652		goto free_failover;
   3653	}
   3654
   3655	virtio_device_ready(vdev);
   3656
   3657	rtnl_unlock();
   3658
   3659	err = virtnet_cpu_notif_add(vi);
   3660	if (err) {
   3661		pr_debug("virtio_net: registering cpu notifier failed\n");
   3662		goto free_unregister_netdev;
   3663	}
   3664
   3665	virtnet_set_queues(vi, vi->curr_queue_pairs);
   3666
    3667	/* Assume link up if device can't report link status,
    3668	 * otherwise get link status from config. */
   3669	netif_carrier_off(dev);
   3670	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
   3671		schedule_work(&vi->config_work);
   3672	} else {
   3673		vi->status = VIRTIO_NET_S_LINK_UP;
   3674		virtnet_update_settings(vi);
   3675		netif_carrier_on(dev);
   3676	}
   3677
   3678	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
   3679		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
   3680			set_bit(guest_offloads[i], &vi->guest_offloads);
   3681	vi->guest_offloads_capable = vi->guest_offloads;
   3682
    3683	pr_debug("virtnet: registered device %s with %d RX and TX vqs\n",
   3684		 dev->name, max_queue_pairs);
   3685
   3686	return 0;
   3687
   3688free_unregister_netdev:
   3689	virtio_reset_device(vdev);
   3690
   3691	unregister_netdev(dev);
   3692free_failover:
   3693	net_failover_destroy(vi->failover);
   3694free_vqs:
   3695	cancel_delayed_work_sync(&vi->refill);
   3696	free_receive_page_frags(vi);
   3697	virtnet_del_vqs(vi);
   3698free:
   3699	free_netdev(dev);
   3700	return err;
   3701}
   3702
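        	/* Reset the device and release every buffer and virtqueue; shared
        	 * by the remove and suspend/resume paths.
        	 */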
   3703static void remove_vq_common(struct virtnet_info *vi)
   3704{
   3705	virtio_reset_device(vi->vdev);
   3706
   3707	/* Free unused buffers in both send and recv, if any. */
   3708	free_unused_bufs(vi);
   3709
   3710	free_receive_bufs(vi);
   3711
   3712	free_receive_page_frags(vi);
   3713
   3714	virtnet_del_vqs(vi);
   3715}
   3716
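        	/* Device removal: stop the config work and CPU notifier, unregister
        	 * the netdev and its failover handling, then free all resources.
        	 */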
   3717static void virtnet_remove(struct virtio_device *vdev)
   3718{
   3719	struct virtnet_info *vi = vdev->priv;
   3720
   3721	virtnet_cpu_notif_remove(vi);
   3722
   3723	/* Make sure no work handler is accessing the device. */
   3724	flush_work(&vi->config_work);
   3725
   3726	unregister_netdev(vi->dev);
   3727
   3728	net_failover_destroy(vi->failover);
   3729
   3730	remove_vq_common(vi);
   3731
   3732	free_netdev(vi->dev);
   3733}
   3734
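        	/* PM freeze: drop the CPU hotplug notifier, quiesce the device and
        	 * tear down the virtqueues so it can be reset across suspend.
        	 */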
   3735static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
   3736{
   3737	struct virtnet_info *vi = vdev->priv;
   3738
   3739	virtnet_cpu_notif_remove(vi);
   3740	virtnet_freeze_down(vdev);
   3741	remove_vq_common(vi);
   3742
   3743	return 0;
   3744}
   3745
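        	/* PM restore: bring the device back up, restore the queue pair
        	 * count and re-register the CPU hotplug notifier; unwind on error.
        	 */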
   3746static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
   3747{
   3748	struct virtnet_info *vi = vdev->priv;
   3749	int err;
   3750
   3751	err = virtnet_restore_up(vdev);
   3752	if (err)
   3753		return err;
   3754	virtnet_set_queues(vi, vi->curr_queue_pairs);
   3755
   3756	err = virtnet_cpu_notif_add(vi);
   3757	if (err) {
   3758		virtnet_freeze_down(vdev);
   3759		remove_vq_common(vi);
   3760		return err;
   3761	}
   3762
   3763	return 0;
   3764}
   3765
   3766static struct virtio_device_id id_table[] = {
   3767	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
   3768	{ 0 },
   3769};
   3770
   3771#define VIRTNET_FEATURES \
   3772	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
   3773	VIRTIO_NET_F_MAC, \
   3774	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
   3775	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
   3776	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
   3777	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
   3778	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
   3779	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
   3780	VIRTIO_NET_F_CTRL_MAC_ADDR, \
   3781	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
   3782	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
   3783	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT
   3784
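        	/* features[] is offered to modern (VIRTIO_F_VERSION_1) devices;
        	 * legacy devices additionally get the GSO and ANY_LAYOUT bits below.
        	 */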
   3785static unsigned int features[] = {
   3786	VIRTNET_FEATURES,
   3787};
   3788
   3789static unsigned int features_legacy[] = {
   3790	VIRTNET_FEATURES,
   3791	VIRTIO_NET_F_GSO,
   3792	VIRTIO_F_ANY_LAYOUT,
   3793};
   3794
   3795static struct virtio_driver virtio_net_driver = {
   3796	.feature_table = features,
   3797	.feature_table_size = ARRAY_SIZE(features),
   3798	.feature_table_legacy = features_legacy,
   3799	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
   3800	.driver.name =	KBUILD_MODNAME,
   3801	.driver.owner =	THIS_MODULE,
   3802	.id_table =	id_table,
   3803	.validate =	virtnet_validate,
   3804	.probe =	virtnet_probe,
   3805	.remove =	virtnet_remove,
   3806	.config_changed = virtnet_config_changed,
   3807#ifdef CONFIG_PM_SLEEP
   3808	.freeze =	virtnet_freeze,
   3809	.restore =	virtnet_restore,
   3810#endif
   3811};
   3812
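        	/* Module init: register the CPU online/dead hotplug callbacks before
        	 * the virtio driver so queue affinity can follow CPU hotplug events.
        	 */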
   3813static __init int virtio_net_driver_init(void)
   3814{
   3815	int ret;
   3816
   3817	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
   3818				      virtnet_cpu_online,
   3819				      virtnet_cpu_down_prep);
   3820	if (ret < 0)
   3821		goto out;
   3822	virtionet_online = ret;
   3823	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
   3824				      NULL, virtnet_cpu_dead);
   3825	if (ret)
   3826		goto err_dead;
   3827	ret = register_virtio_driver(&virtio_net_driver);
   3828	if (ret)
   3829		goto err_virtio;
   3830	return 0;
   3831err_virtio:
   3832	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
   3833err_dead:
   3834	cpuhp_remove_multi_state(virtionet_online);
   3835out:
   3836	return ret;
   3837}
   3838module_init(virtio_net_driver_init);
   3839
   3840static __exit void virtio_net_driver_exit(void)
   3841{
   3842	unregister_virtio_driver(&virtio_net_driver);
   3843	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
   3844	cpuhp_remove_multi_state(virtionet_online);
   3845}
   3846module_exit(virtio_net_driver_exit);
   3847
   3848MODULE_DEVICE_TABLE(virtio, id_table);
   3849MODULE_DESCRIPTION("Virtio network driver");
   3850MODULE_LICENSE("GPL");