cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

netback.c (46464B)


      1/*
      2 * Back-end of the driver for virtual network devices. This portion of the
      3 * driver exports a 'unified' network-device interface that can be accessed
      4 * by any operating system that implements a compatible front end. A
      5 * reference front-end implementation can be found in:
      6 *  drivers/net/xen-netfront.c
      7 *
      8 * Copyright (c) 2002-2005, K A Fraser
      9 *
     10 * This program is free software; you can redistribute it and/or
     11 * modify it under the terms of the GNU General Public License version 2
     12 * as published by the Free Software Foundation; or, when distributed
     13 * separately from the Linux kernel or incorporated into other
     14 * software packages, subject to the following license:
     15 *
     16 * Permission is hereby granted, free of charge, to any person obtaining a copy
     17 * of this source file (the "Software"), to deal in the Software without
     18 * restriction, including without limitation the rights to use, copy, modify,
     19 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
     20 * and to permit persons to whom the Software is furnished to do so, subject to
     21 * the following conditions:
     22 *
     23 * The above copyright notice and this permission notice shall be included in
     24 * all copies or substantial portions of the Software.
     25 *
     26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     32 * IN THE SOFTWARE.
     33 */
     34
     35#include "common.h"
     36
     37#include <linux/kthread.h>
     38#include <linux/if_vlan.h>
     39#include <linux/udp.h>
     40#include <linux/highmem.h>
     41
     42#include <net/tcp.h>
     43
     44#include <xen/xen.h>
     45#include <xen/events.h>
     46#include <xen/interface/memory.h>
     47#include <xen/page.h>
     48
     49#include <asm/xen/hypercall.h>
     50
     51/* Provide an option to disable split event channels at load time as
     52 * event channels are a limited resource. Split event channels are
     53 * enabled by default.
     54 */
     55bool separate_tx_rx_irq = true;
     56module_param(separate_tx_rx_irq, bool, 0644);
     57
     58/* The time that packets can stay on the guest Rx internal queue
     59 * before they are dropped.
     60 */
     61unsigned int rx_drain_timeout_msecs = 10000;
     62module_param(rx_drain_timeout_msecs, uint, 0444);
     63
     64/* The length of time before the frontend is considered unresponsive
     65 * because it isn't providing Rx slots.
     66 */
     67unsigned int rx_stall_timeout_msecs = 60000;
     68module_param(rx_stall_timeout_msecs, uint, 0444);
     69
     70#define MAX_QUEUES_DEFAULT 8
     71unsigned int xenvif_max_queues;
     72module_param_named(max_queues, xenvif_max_queues, uint, 0644);
     73MODULE_PARM_DESC(max_queues,
     74		 "Maximum number of queues per virtual interface");
     75
     76/*
     77 * This is the maximum number of slots an skb can have. If a guest sends
     78 * an skb which exceeds this limit, it is considered malicious.
     79 */
     80#define FATAL_SKB_SLOTS_DEFAULT 20
     81static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
     82module_param(fatal_skb_slots, uint, 0444);
     83
     84/* The amount to copy out of the first guest Tx slot into the skb's
     85 * linear area.  If the first slot has more data, it will be mapped
     86 * and put into the first frag.
     87 *
     88 * This is sized to avoid pulling headers from the frags for most
     89 * TCP/IP packets.
     90 */
     91#define XEN_NETBACK_TX_COPY_LEN 128
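
/*
 * Illustrative sketch (editor's addition, not part of the original netback.c):
 * the copy-length decision that xenvif_tx_build_gops() makes further down
 * boils down to copying at most XEN_NETBACK_TX_COPY_LEN bytes of the first
 * slot into the skb's linear area and grant-mapping the rest as the first
 * frag. A simplified helper (it ignores the additional slot-count check the
 * real code applies; the name is hypothetical):
 */
static inline unsigned int example_tx_copy_len(unsigned int first_slot_size)
{
	return first_slot_size > XEN_NETBACK_TX_COPY_LEN ?
	       XEN_NETBACK_TX_COPY_LEN : first_slot_size;
}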
     92
     93/* This is the maximum number of flows in the hash cache. */
     94#define XENVIF_HASH_CACHE_SIZE_DEFAULT 64
     95unsigned int xenvif_hash_cache_size = XENVIF_HASH_CACHE_SIZE_DEFAULT;
     96module_param_named(hash_cache_size, xenvif_hash_cache_size, uint, 0644);
     97MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash cache");
     98
     99/* This module parameter tells whether we have to put data
    100 * for xen-netfront at the XDP_PACKET_HEADROOM offset
    101 * needed for XDP processing.
    102 */
    103bool provides_xdp_headroom = true;
    104module_param(provides_xdp_headroom, bool, 0644);
    105
    106static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
    107			       u8 status);
    108
    109static void make_tx_response(struct xenvif_queue *queue,
    110			     struct xen_netif_tx_request *txp,
    111			     unsigned int extra_count,
    112			     s8       st);
    113static void push_tx_responses(struct xenvif_queue *queue);
    114
    115static inline int tx_work_todo(struct xenvif_queue *queue);
    116
    117static inline unsigned long idx_to_pfn(struct xenvif_queue *queue,
    118				       u16 idx)
    119{
    120	return page_to_pfn(queue->mmap_pages[idx]);
    121}
    122
    123static inline unsigned long idx_to_kaddr(struct xenvif_queue *queue,
    124					 u16 idx)
    125{
    126	return (unsigned long)pfn_to_kaddr(idx_to_pfn(queue, idx));
    127}
    128
    129#define callback_param(vif, pending_idx) \
    130	(vif->pending_tx_info[pending_idx].callback_struct)
    131
    132/* Find the containing VIF's structure from a pointer in the pending_tx_info array
    133 */
    134static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf)
    135{
    136	u16 pending_idx = ubuf->desc;
    137	struct pending_tx_info *temp =
    138		container_of(ubuf, struct pending_tx_info, callback_struct);
    139	return container_of(temp - pending_idx,
    140			    struct xenvif_queue,
    141			    pending_tx_info[0]);
    142}
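
/*
 * Illustrative sketch (editor's addition): ubuf_to_queue() above recovers the
 * queue in two steps. container_of() turns the embedded callback_struct
 * pointer into its surrounding pending_tx_info element, and subtracting
 * pending_idx steps back to element 0 of the array, which is what
 * container_of(..., pending_tx_info[0]) expects. The same pattern on a
 * made-up structure (all names here are hypothetical):
 */
struct example_elem { int payload; };
struct example_outer { int other; struct example_elem elems[4]; };

static inline struct example_outer *
example_elem_to_outer(struct example_elem *e, unsigned int idx)
{
	/* &e[-idx] is &elems[0], so container_of() over elems[0] works. */
	return container_of(e - idx, struct example_outer, elems[0]);
}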
    143
    144static u16 frag_get_pending_idx(skb_frag_t *frag)
    145{
    146	return (u16)skb_frag_off(frag);
    147}
    148
    149static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
    150{
    151	skb_frag_off_set(frag, pending_idx);
    152}
    153
    154static inline pending_ring_idx_t pending_index(unsigned i)
    155{
    156	return i & (MAX_PENDING_REQS-1);
    157}
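
/*
 * Illustrative sketch (editor's addition): pending_index() relies on
 * MAX_PENDING_REQS being a power of two, so masking a free-running counter
 * with (size - 1) is equivalent to a modulo, e.g. 9 & 7 == 9 % 8 == 1.
 * A generic form (hypothetical name, valid only for power-of-two sizes):
 */
static inline unsigned int example_ring_index(unsigned int i, unsigned int size)
{
	return i & (size - 1);
}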
    158
    159void xenvif_kick_thread(struct xenvif_queue *queue)
    160{
    161	wake_up(&queue->wq);
    162}
    163
    164void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
    165{
    166	int more_to_do;
    167
    168	RING_FINAL_CHECK_FOR_REQUESTS(&queue->tx, more_to_do);
    169
    170	if (more_to_do)
    171		napi_schedule(&queue->napi);
    172	else if (atomic_fetch_andnot(NETBK_TX_EOI | NETBK_COMMON_EOI,
    173				     &queue->eoi_pending) &
    174		 (NETBK_TX_EOI | NETBK_COMMON_EOI))
    175		xen_irq_lateeoi(queue->tx_irq, 0);
    176}
    177
    178static void tx_add_credit(struct xenvif_queue *queue)
    179{
    180	unsigned long max_burst, max_credit;
    181
    182	/*
    183	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
    184	 * Otherwise the interface can seize up due to insufficient credit.
    185	 */
    186	max_burst = max(131072UL, queue->credit_bytes);
    187
    188	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
    189	max_credit = queue->remaining_credit + queue->credit_bytes;
    190	if (max_credit < queue->remaining_credit)
    191		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
    192
    193	queue->remaining_credit = min(max_credit, max_burst);
    194	queue->rate_limited = false;
    195}
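
/*
 * Illustrative sketch (editor's addition): the wrap check in tx_add_credit()
 * uses the fact that an unsigned sum that overflows becomes smaller than
 * either operand. The same idea as a generic saturating add (hypothetical
 * name):
 */
static inline unsigned long example_sat_add(unsigned long a, unsigned long b)
{
	unsigned long sum = a + b;

	return sum < a ? ULONG_MAX : sum;	/* clamp on wrap-around */
}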
    196
    197void xenvif_tx_credit_callback(struct timer_list *t)
    198{
    199	struct xenvif_queue *queue = from_timer(queue, t, credit_timeout);
    200	tx_add_credit(queue);
    201	xenvif_napi_schedule_or_enable_events(queue);
    202}
    203
    204static void xenvif_tx_err(struct xenvif_queue *queue,
    205			  struct xen_netif_tx_request *txp,
    206			  unsigned int extra_count, RING_IDX end)
    207{
    208	RING_IDX cons = queue->tx.req_cons;
    209	unsigned long flags;
    210
    211	do {
    212		spin_lock_irqsave(&queue->response_lock, flags);
    213		make_tx_response(queue, txp, extra_count, XEN_NETIF_RSP_ERROR);
    214		push_tx_responses(queue);
    215		spin_unlock_irqrestore(&queue->response_lock, flags);
    216		if (cons == end)
    217			break;
    218		RING_COPY_REQUEST(&queue->tx, cons++, txp);
    219		extra_count = 0; /* only the first frag can have extras */
    220	} while (1);
    221	queue->tx.req_cons = cons;
    222}
    223
    224static void xenvif_fatal_tx_err(struct xenvif *vif)
    225{
    226	netdev_err(vif->dev, "fatal error; disabling device\n");
    227	vif->disabled = true;
    228	/* Disable the vif from queue 0's kthread */
    229	if (vif->num_queues)
    230		xenvif_kick_thread(&vif->queues[0]);
    231}
    232
    233static int xenvif_count_requests(struct xenvif_queue *queue,
    234				 struct xen_netif_tx_request *first,
    235				 unsigned int extra_count,
    236				 struct xen_netif_tx_request *txp,
    237				 int work_to_do)
    238{
    239	RING_IDX cons = queue->tx.req_cons;
    240	int slots = 0;
    241	int drop_err = 0;
    242	int more_data;
    243
    244	if (!(first->flags & XEN_NETTXF_more_data))
    245		return 0;
    246
    247	do {
    248		struct xen_netif_tx_request dropped_tx = { 0 };
    249
    250		if (slots >= work_to_do) {
    251			netdev_err(queue->vif->dev,
    252				   "Asked for %d slots but exceeds this limit\n",
    253				   work_to_do);
    254			xenvif_fatal_tx_err(queue->vif);
    255			return -ENODATA;
    256		}
    257
     258		/* This guest is really using too many slots and is
     259		 * considered malicious.
    260		 */
    261		if (unlikely(slots >= fatal_skb_slots)) {
    262			netdev_err(queue->vif->dev,
    263				   "Malicious frontend using %d slots, threshold %u\n",
    264				   slots, fatal_skb_slots);
    265			xenvif_fatal_tx_err(queue->vif);
    266			return -E2BIG;
    267		}
    268
     269		/* The Xen network protocol had an implicit dependency on
    270		 * MAX_SKB_FRAGS. XEN_NETBK_LEGACY_SLOTS_MAX is set to
    271		 * the historical MAX_SKB_FRAGS value 18 to honor the
    272		 * same behavior as before. Any packet using more than
    273		 * 18 slots but less than fatal_skb_slots slots is
    274		 * dropped
    275		 */
    276		if (!drop_err && slots >= XEN_NETBK_LEGACY_SLOTS_MAX) {
    277			if (net_ratelimit())
    278				netdev_dbg(queue->vif->dev,
    279					   "Too many slots (%d) exceeding limit (%d), dropping packet\n",
    280					   slots, XEN_NETBK_LEGACY_SLOTS_MAX);
    281			drop_err = -E2BIG;
    282		}
    283
    284		if (drop_err)
    285			txp = &dropped_tx;
    286
    287		RING_COPY_REQUEST(&queue->tx, cons + slots, txp);
    288
    289		/* If the guest submitted a frame >= 64 KiB then
    290		 * first->size overflowed and following slots will
    291		 * appear to be larger than the frame.
    292		 *
     293		 * This cannot be a fatal error as there are buggy
    294		 * frontends that do this.
    295		 *
    296		 * Consume all slots and drop the packet.
    297		 */
    298		if (!drop_err && txp->size > first->size) {
    299			if (net_ratelimit())
    300				netdev_dbg(queue->vif->dev,
    301					   "Invalid tx request, slot size %u > remaining size %u\n",
    302					   txp->size, first->size);
    303			drop_err = -EIO;
    304		}
    305
    306		first->size -= txp->size;
    307		slots++;
    308
    309		if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) {
    310			netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n",
    311				 txp->offset, txp->size);
    312			xenvif_fatal_tx_err(queue->vif);
    313			return -EINVAL;
    314		}
    315
    316		more_data = txp->flags & XEN_NETTXF_more_data;
    317
    318		if (!drop_err)
    319			txp++;
    320
    321	} while (more_data);
    322
    323	if (drop_err) {
    324		xenvif_tx_err(queue, first, extra_count, cons + slots);
    325		return drop_err;
    326	}
    327
    328	return slots;
    329}
    330
    331
    332struct xenvif_tx_cb {
    333	u16 pending_idx;
    334};
    335
    336#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
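
/*
 * Illustrative sketch (editor's addition): XENVIF_TX_CB() overlays a private
 * per-packet structure on the skb->cb scratch area, a common driver idiom.
 * A defensive variant might assert the size at build time (hypothetical
 * helper, assuming the usual BUILD_BUG_ON() from <linux/build_bug.h>):
 */
static inline struct xenvif_tx_cb *example_tx_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct xenvif_tx_cb) > sizeof(skb->cb));
	return (struct xenvif_tx_cb *)skb->cb;
}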
    337
    338static inline void xenvif_tx_create_map_op(struct xenvif_queue *queue,
    339					   u16 pending_idx,
    340					   struct xen_netif_tx_request *txp,
    341					   unsigned int extra_count,
    342					   struct gnttab_map_grant_ref *mop)
    343{
    344	queue->pages_to_map[mop-queue->tx_map_ops] = queue->mmap_pages[pending_idx];
    345	gnttab_set_map_op(mop, idx_to_kaddr(queue, pending_idx),
    346			  GNTMAP_host_map | GNTMAP_readonly,
    347			  txp->gref, queue->vif->domid);
    348
    349	memcpy(&queue->pending_tx_info[pending_idx].req, txp,
    350	       sizeof(*txp));
    351	queue->pending_tx_info[pending_idx].extra_count = extra_count;
    352}
    353
    354static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
    355{
    356	struct sk_buff *skb =
    357		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
    358			  GFP_ATOMIC | __GFP_NOWARN);
    359	if (unlikely(skb == NULL))
    360		return NULL;
    361
    362	/* Packets passed to netif_rx() must have some headroom. */
    363	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
    364
    365	/* Initialize it here to avoid later surprises */
    366	skb_shinfo(skb)->destructor_arg = NULL;
    367
    368	return skb;
    369}
    370
    371static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue,
    372							struct sk_buff *skb,
    373							struct xen_netif_tx_request *txp,
    374							struct gnttab_map_grant_ref *gop,
    375							unsigned int frag_overflow,
    376							struct sk_buff *nskb)
    377{
    378	struct skb_shared_info *shinfo = skb_shinfo(skb);
    379	skb_frag_t *frags = shinfo->frags;
    380	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
    381	int start;
    382	pending_ring_idx_t index;
    383	unsigned int nr_slots;
    384
    385	nr_slots = shinfo->nr_frags;
    386
     387	/* Skip the first skb fragment if it is on the same page as the header. */
    388	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
    389
    390	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
    391	     shinfo->nr_frags++, txp++, gop++) {
    392		index = pending_index(queue->pending_cons++);
    393		pending_idx = queue->pending_ring[index];
    394		xenvif_tx_create_map_op(queue, pending_idx, txp, 0, gop);
    395		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
    396	}
    397
    398	if (frag_overflow) {
    399
    400		shinfo = skb_shinfo(nskb);
    401		frags = shinfo->frags;
    402
    403		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
    404		     shinfo->nr_frags++, txp++, gop++) {
    405			index = pending_index(queue->pending_cons++);
    406			pending_idx = queue->pending_ring[index];
    407			xenvif_tx_create_map_op(queue, pending_idx, txp, 0,
    408						gop);
    409			frag_set_pending_idx(&frags[shinfo->nr_frags],
    410					     pending_idx);
    411		}
    412
    413		skb_shinfo(skb)->frag_list = nskb;
    414	}
    415
    416	return gop;
    417}
    418
    419static inline void xenvif_grant_handle_set(struct xenvif_queue *queue,
    420					   u16 pending_idx,
    421					   grant_handle_t handle)
    422{
    423	if (unlikely(queue->grant_tx_handle[pending_idx] !=
    424		     NETBACK_INVALID_HANDLE)) {
    425		netdev_err(queue->vif->dev,
    426			   "Trying to overwrite active handle! pending_idx: 0x%x\n",
    427			   pending_idx);
    428		BUG();
    429	}
    430	queue->grant_tx_handle[pending_idx] = handle;
    431}
    432
    433static inline void xenvif_grant_handle_reset(struct xenvif_queue *queue,
    434					     u16 pending_idx)
    435{
    436	if (unlikely(queue->grant_tx_handle[pending_idx] ==
    437		     NETBACK_INVALID_HANDLE)) {
    438		netdev_err(queue->vif->dev,
    439			   "Trying to unmap invalid handle! pending_idx: 0x%x\n",
    440			   pending_idx);
    441		BUG();
    442	}
    443	queue->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
    444}
    445
    446static int xenvif_tx_check_gop(struct xenvif_queue *queue,
    447			       struct sk_buff *skb,
    448			       struct gnttab_map_grant_ref **gopp_map,
    449			       struct gnttab_copy **gopp_copy)
    450{
    451	struct gnttab_map_grant_ref *gop_map = *gopp_map;
    452	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
    453	/* This always points to the shinfo of the skb being checked, which
    454	 * could be either the first or the one on the frag_list
    455	 */
    456	struct skb_shared_info *shinfo = skb_shinfo(skb);
    457	/* If this is non-NULL, we are currently checking the frag_list skb, and
    458	 * this points to the shinfo of the first one
    459	 */
    460	struct skb_shared_info *first_shinfo = NULL;
    461	int nr_frags = shinfo->nr_frags;
    462	const bool sharedslot = nr_frags &&
    463				frag_get_pending_idx(&shinfo->frags[0]) == pending_idx;
    464	int i, err;
    465
    466	/* Check status of header. */
    467	err = (*gopp_copy)->status;
    468	if (unlikely(err)) {
    469		if (net_ratelimit())
    470			netdev_dbg(queue->vif->dev,
    471				   "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
    472				   (*gopp_copy)->status,
    473				   pending_idx,
    474				   (*gopp_copy)->source.u.ref);
    475		/* The first frag might still have this slot mapped */
    476		if (!sharedslot)
    477			xenvif_idx_release(queue, pending_idx,
    478					   XEN_NETIF_RSP_ERROR);
    479	}
    480	(*gopp_copy)++;
    481
    482check_frags:
    483	for (i = 0; i < nr_frags; i++, gop_map++) {
    484		int j, newerr;
    485
    486		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
    487
    488		/* Check error status: if okay then remember grant handle. */
    489		newerr = gop_map->status;
    490
    491		if (likely(!newerr)) {
    492			xenvif_grant_handle_set(queue,
    493						pending_idx,
    494						gop_map->handle);
    495			/* Had a previous error? Invalidate this fragment. */
    496			if (unlikely(err)) {
    497				xenvif_idx_unmap(queue, pending_idx);
    498				/* If the mapping of the first frag was OK, but
    499				 * the header's copy failed, and they are
    500				 * sharing a slot, send an error
    501				 */
    502				if (i == 0 && !first_shinfo && sharedslot)
    503					xenvif_idx_release(queue, pending_idx,
    504							   XEN_NETIF_RSP_ERROR);
    505				else
    506					xenvif_idx_release(queue, pending_idx,
    507							   XEN_NETIF_RSP_OKAY);
    508			}
    509			continue;
    510		}
    511
    512		/* Error on this fragment: respond to client with an error. */
    513		if (net_ratelimit())
    514			netdev_dbg(queue->vif->dev,
    515				   "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
    516				   i,
    517				   gop_map->status,
    518				   pending_idx,
    519				   gop_map->ref);
    520
    521		xenvif_idx_release(queue, pending_idx, XEN_NETIF_RSP_ERROR);
    522
    523		/* Not the first error? Preceding frags already invalidated. */
    524		if (err)
    525			continue;
    526
     527		/* First error: if the header hasn't shared a slot with the
    528		 * first frag, release it as well.
    529		 */
    530		if (!sharedslot)
    531			xenvif_idx_release(queue,
    532					   XENVIF_TX_CB(skb)->pending_idx,
    533					   XEN_NETIF_RSP_OKAY);
    534
    535		/* Invalidate preceding fragments of this skb. */
    536		for (j = 0; j < i; j++) {
    537			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
    538			xenvif_idx_unmap(queue, pending_idx);
    539			xenvif_idx_release(queue, pending_idx,
    540					   XEN_NETIF_RSP_OKAY);
    541		}
    542
    543		/* And if we found the error while checking the frag_list, unmap
    544		 * the first skb's frags
    545		 */
    546		if (first_shinfo) {
    547			for (j = 0; j < first_shinfo->nr_frags; j++) {
    548				pending_idx = frag_get_pending_idx(&first_shinfo->frags[j]);
    549				xenvif_idx_unmap(queue, pending_idx);
    550				xenvif_idx_release(queue, pending_idx,
    551						   XEN_NETIF_RSP_OKAY);
    552			}
    553		}
    554
    555		/* Remember the error: invalidate all subsequent fragments. */
    556		err = newerr;
    557	}
    558
    559	if (skb_has_frag_list(skb) && !first_shinfo) {
    560		first_shinfo = shinfo;
    561		shinfo = skb_shinfo(shinfo->frag_list);
    562		nr_frags = shinfo->nr_frags;
    563
    564		goto check_frags;
    565	}
    566
    567	*gopp_map = gop_map;
    568	return err;
    569}
    570
    571static void xenvif_fill_frags(struct xenvif_queue *queue, struct sk_buff *skb)
    572{
    573	struct skb_shared_info *shinfo = skb_shinfo(skb);
    574	int nr_frags = shinfo->nr_frags;
    575	int i;
    576	u16 prev_pending_idx = INVALID_PENDING_IDX;
    577
    578	for (i = 0; i < nr_frags; i++) {
    579		skb_frag_t *frag = shinfo->frags + i;
    580		struct xen_netif_tx_request *txp;
    581		struct page *page;
    582		u16 pending_idx;
    583
    584		pending_idx = frag_get_pending_idx(frag);
    585
     586		/* If this is not the first frag, chain it to the previous one. */
    587		if (prev_pending_idx == INVALID_PENDING_IDX)
    588			skb_shinfo(skb)->destructor_arg =
    589				&callback_param(queue, pending_idx);
    590		else
    591			callback_param(queue, prev_pending_idx).ctx =
    592				&callback_param(queue, pending_idx);
    593
    594		callback_param(queue, pending_idx).ctx = NULL;
    595		prev_pending_idx = pending_idx;
    596
    597		txp = &queue->pending_tx_info[pending_idx].req;
    598		page = virt_to_page(idx_to_kaddr(queue, pending_idx));
    599		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
    600		skb->len += txp->size;
    601		skb->data_len += txp->size;
    602		skb->truesize += txp->size;
    603
    604		/* Take an extra reference to offset network stack's put_page */
    605		get_page(queue->mmap_pages[pending_idx]);
    606	}
    607}
    608
    609static int xenvif_get_extras(struct xenvif_queue *queue,
    610			     struct xen_netif_extra_info *extras,
    611			     unsigned int *extra_count,
    612			     int work_to_do)
    613{
    614	struct xen_netif_extra_info extra;
    615	RING_IDX cons = queue->tx.req_cons;
    616
    617	do {
    618		if (unlikely(work_to_do-- <= 0)) {
    619			netdev_err(queue->vif->dev, "Missing extra info\n");
    620			xenvif_fatal_tx_err(queue->vif);
    621			return -EBADR;
    622		}
    623
    624		RING_COPY_REQUEST(&queue->tx, cons, &extra);
    625
    626		queue->tx.req_cons = ++cons;
    627		(*extra_count)++;
    628
    629		if (unlikely(!extra.type ||
    630			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
    631			netdev_err(queue->vif->dev,
    632				   "Invalid extra type: %d\n", extra.type);
    633			xenvif_fatal_tx_err(queue->vif);
    634			return -EINVAL;
    635		}
    636
    637		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
    638	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
    639
    640	return work_to_do;
    641}
    642
    643static int xenvif_set_skb_gso(struct xenvif *vif,
    644			      struct sk_buff *skb,
    645			      struct xen_netif_extra_info *gso)
    646{
    647	if (!gso->u.gso.size) {
    648		netdev_err(vif->dev, "GSO size must not be zero.\n");
    649		xenvif_fatal_tx_err(vif);
    650		return -EINVAL;
    651	}
    652
    653	switch (gso->u.gso.type) {
    654	case XEN_NETIF_GSO_TYPE_TCPV4:
    655		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
    656		break;
    657	case XEN_NETIF_GSO_TYPE_TCPV6:
    658		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
    659		break;
    660	default:
    661		netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
    662		xenvif_fatal_tx_err(vif);
    663		return -EINVAL;
    664	}
    665
    666	skb_shinfo(skb)->gso_size = gso->u.gso.size;
    667	/* gso_segs will be calculated later */
    668
    669	return 0;
    670}
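
/*
 * Illustrative sketch (editor's addition): gso_segs is filled in later, in
 * xenvif_tx_submit(), as DIV_ROUND_UP(skb->len - hdrlen, mss). In isolation
 * (hypothetical name):
 */
static inline unsigned int example_gso_segs(unsigned int payload_len,
					    unsigned int mss)
{
	/* Number of MSS-sized segments the stack will emit for this skb. */
	return DIV_ROUND_UP(payload_len, mss);
}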
    671
    672static int checksum_setup(struct xenvif_queue *queue, struct sk_buff *skb)
    673{
    674	bool recalculate_partial_csum = false;
    675
    676	/* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
    677	 * peers can fail to set NETRXF_csum_blank when sending a GSO
    678	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
    679	 * recalculate the partial checksum.
    680	 */
    681	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
    682		queue->stats.rx_gso_checksum_fixup++;
    683		skb->ip_summed = CHECKSUM_PARTIAL;
    684		recalculate_partial_csum = true;
    685	}
    686
    687	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
    688	if (skb->ip_summed != CHECKSUM_PARTIAL)
    689		return 0;
    690
    691	return skb_checksum_setup(skb, recalculate_partial_csum);
    692}
    693
    694static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
    695{
    696	u64 now = get_jiffies_64();
    697	u64 next_credit = queue->credit_window_start +
    698		msecs_to_jiffies(queue->credit_usec / 1000);
    699
    700	/* Timer could already be pending in rare cases. */
    701	if (timer_pending(&queue->credit_timeout)) {
    702		queue->rate_limited = true;
    703		return true;
    704	}
    705
    706	/* Passed the point where we can replenish credit? */
    707	if (time_after_eq64(now, next_credit)) {
    708		queue->credit_window_start = now;
    709		tx_add_credit(queue);
    710	}
    711
    712	/* Still too big to send right now? Set a callback. */
    713	if (size > queue->remaining_credit) {
    714		mod_timer(&queue->credit_timeout,
    715			  next_credit);
    716		queue->credit_window_start = next_credit;
    717		queue->rate_limited = true;
    718
    719		return true;
    720	}
    721
    722	return false;
    723}
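
/*
 * Illustrative sketch (editor's addition): the credit window above spans
 * credit_usec microseconds starting at credit_window_start. With, say,
 * credit_bytes = 1000000 and credit_usec = 50000, roughly 1 MB may be sent
 * per 50 ms window; a larger request arms credit_timeout for the start of
 * the next window. The window-end computation in isolation (hypothetical
 * name):
 */
static inline u64 example_next_credit(u64 window_start, unsigned int credit_usec)
{
	return window_start + msecs_to_jiffies(credit_usec / 1000);
}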
    724
    725/* No locking is required in xenvif_mcast_add/del() as they are
    726 * only ever invoked from NAPI poll. An RCU list is used because
    727 * xenvif_mcast_match() is called asynchronously, during start_xmit.
    728 */
    729
    730static int xenvif_mcast_add(struct xenvif *vif, const u8 *addr)
    731{
    732	struct xenvif_mcast_addr *mcast;
    733
    734	if (vif->fe_mcast_count == XEN_NETBK_MCAST_MAX) {
    735		if (net_ratelimit())
    736			netdev_err(vif->dev,
    737				   "Too many multicast addresses\n");
    738		return -ENOSPC;
    739	}
    740
    741	mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC);
    742	if (!mcast)
    743		return -ENOMEM;
    744
    745	ether_addr_copy(mcast->addr, addr);
    746	list_add_tail_rcu(&mcast->entry, &vif->fe_mcast_addr);
    747	vif->fe_mcast_count++;
    748
    749	return 0;
    750}
    751
    752static void xenvif_mcast_del(struct xenvif *vif, const u8 *addr)
    753{
    754	struct xenvif_mcast_addr *mcast;
    755
    756	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
    757		if (ether_addr_equal(addr, mcast->addr)) {
    758			--vif->fe_mcast_count;
    759			list_del_rcu(&mcast->entry);
    760			kfree_rcu(mcast, rcu);
    761			break;
    762		}
    763	}
    764}
    765
    766bool xenvif_mcast_match(struct xenvif *vif, const u8 *addr)
    767{
    768	struct xenvif_mcast_addr *mcast;
    769
    770	rcu_read_lock();
    771	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
    772		if (ether_addr_equal(addr, mcast->addr)) {
    773			rcu_read_unlock();
    774			return true;
    775		}
    776	}
    777	rcu_read_unlock();
    778
    779	return false;
    780}
    781
    782void xenvif_mcast_addr_list_free(struct xenvif *vif)
    783{
    784	/* No need for locking or RCU here. NAPI poll and TX queue
    785	 * are stopped.
    786	 */
    787	while (!list_empty(&vif->fe_mcast_addr)) {
    788		struct xenvif_mcast_addr *mcast;
    789
    790		mcast = list_first_entry(&vif->fe_mcast_addr,
    791					 struct xenvif_mcast_addr,
    792					 entry);
    793		--vif->fe_mcast_count;
    794		list_del(&mcast->entry);
    795		kfree(mcast);
    796	}
    797}
    798
    799static void xenvif_tx_build_gops(struct xenvif_queue *queue,
    800				     int budget,
    801				     unsigned *copy_ops,
    802				     unsigned *map_ops)
    803{
    804	struct gnttab_map_grant_ref *gop = queue->tx_map_ops;
    805	struct sk_buff *skb, *nskb;
    806	int ret;
    807	unsigned int frag_overflow;
    808
    809	while (skb_queue_len(&queue->tx_queue) < budget) {
    810		struct xen_netif_tx_request txreq;
    811		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
    812		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
    813		unsigned int extra_count;
    814		u16 pending_idx;
    815		RING_IDX idx;
    816		int work_to_do;
    817		unsigned int data_len;
    818		pending_ring_idx_t index;
    819
    820		if (queue->tx.sring->req_prod - queue->tx.req_cons >
    821		    XEN_NETIF_TX_RING_SIZE) {
    822			netdev_err(queue->vif->dev,
    823				   "Impossible number of requests. "
    824				   "req_prod %d, req_cons %d, size %ld\n",
    825				   queue->tx.sring->req_prod, queue->tx.req_cons,
    826				   XEN_NETIF_TX_RING_SIZE);
    827			xenvif_fatal_tx_err(queue->vif);
    828			break;
    829		}
    830
    831		work_to_do = XEN_RING_NR_UNCONSUMED_REQUESTS(&queue->tx);
    832		if (!work_to_do)
    833			break;
    834
    835		idx = queue->tx.req_cons;
    836		rmb(); /* Ensure that we see the request before we copy it. */
    837		RING_COPY_REQUEST(&queue->tx, idx, &txreq);
    838
    839		/* Credit-based scheduling. */
    840		if (txreq.size > queue->remaining_credit &&
    841		    tx_credit_exceeded(queue, txreq.size))
    842			break;
    843
    844		queue->remaining_credit -= txreq.size;
    845
    846		work_to_do--;
    847		queue->tx.req_cons = ++idx;
    848
    849		memset(extras, 0, sizeof(extras));
    850		extra_count = 0;
    851		if (txreq.flags & XEN_NETTXF_extra_info) {
    852			work_to_do = xenvif_get_extras(queue, extras,
    853						       &extra_count,
    854						       work_to_do);
    855			idx = queue->tx.req_cons;
    856			if (unlikely(work_to_do < 0))
    857				break;
    858		}
    859
    860		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1].type) {
    861			struct xen_netif_extra_info *extra;
    862
    863			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1];
    864			ret = xenvif_mcast_add(queue->vif, extra->u.mcast.addr);
    865
    866			make_tx_response(queue, &txreq, extra_count,
    867					 (ret == 0) ?
    868					 XEN_NETIF_RSP_OKAY :
    869					 XEN_NETIF_RSP_ERROR);
    870			push_tx_responses(queue);
    871			continue;
    872		}
    873
    874		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1].type) {
    875			struct xen_netif_extra_info *extra;
    876
    877			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1];
    878			xenvif_mcast_del(queue->vif, extra->u.mcast.addr);
    879
    880			make_tx_response(queue, &txreq, extra_count,
    881					 XEN_NETIF_RSP_OKAY);
    882			push_tx_responses(queue);
    883			continue;
    884		}
    885
    886		ret = xenvif_count_requests(queue, &txreq, extra_count,
    887					    txfrags, work_to_do);
    888		if (unlikely(ret < 0))
    889			break;
    890
    891		idx += ret;
    892
    893		if (unlikely(txreq.size < ETH_HLEN)) {
    894			netdev_dbg(queue->vif->dev,
    895				   "Bad packet size: %d\n", txreq.size);
    896			xenvif_tx_err(queue, &txreq, extra_count, idx);
    897			break;
    898		}
    899
    900		/* No crossing a page as the payload mustn't fragment. */
    901		if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) {
    902			netdev_err(queue->vif->dev,
    903				   "txreq.offset: %u, size: %u, end: %lu\n",
    904				   txreq.offset, txreq.size,
    905				   (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size);
    906			xenvif_fatal_tx_err(queue->vif);
    907			break;
    908		}
    909
    910		index = pending_index(queue->pending_cons);
    911		pending_idx = queue->pending_ring[index];
    912
    913		data_len = (txreq.size > XEN_NETBACK_TX_COPY_LEN &&
    914			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
    915			XEN_NETBACK_TX_COPY_LEN : txreq.size;
    916
    917		skb = xenvif_alloc_skb(data_len);
    918		if (unlikely(skb == NULL)) {
    919			netdev_dbg(queue->vif->dev,
    920				   "Can't allocate a skb in start_xmit.\n");
    921			xenvif_tx_err(queue, &txreq, extra_count, idx);
    922			break;
    923		}
    924
    925		skb_shinfo(skb)->nr_frags = ret;
    926		if (data_len < txreq.size)
    927			skb_shinfo(skb)->nr_frags++;
    928		/* At this point shinfo->nr_frags is in fact the number of
    929		 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
    930		 */
    931		frag_overflow = 0;
    932		nskb = NULL;
    933		if (skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS) {
    934			frag_overflow = skb_shinfo(skb)->nr_frags - MAX_SKB_FRAGS;
    935			BUG_ON(frag_overflow > MAX_SKB_FRAGS);
    936			skb_shinfo(skb)->nr_frags = MAX_SKB_FRAGS;
    937			nskb = xenvif_alloc_skb(0);
    938			if (unlikely(nskb == NULL)) {
    939				skb_shinfo(skb)->nr_frags = 0;
    940				kfree_skb(skb);
    941				xenvif_tx_err(queue, &txreq, extra_count, idx);
    942				if (net_ratelimit())
    943					netdev_err(queue->vif->dev,
    944						   "Can't allocate the frag_list skb.\n");
    945				break;
    946			}
    947		}
    948
    949		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
    950			struct xen_netif_extra_info *gso;
    951			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
    952
    953			if (xenvif_set_skb_gso(queue->vif, skb, gso)) {
    954				/* Failure in xenvif_set_skb_gso is fatal. */
    955				skb_shinfo(skb)->nr_frags = 0;
    956				kfree_skb(skb);
    957				kfree_skb(nskb);
    958				break;
    959			}
    960		}
    961
    962		if (extras[XEN_NETIF_EXTRA_TYPE_HASH - 1].type) {
    963			struct xen_netif_extra_info *extra;
    964			enum pkt_hash_types type = PKT_HASH_TYPE_NONE;
    965
    966			extra = &extras[XEN_NETIF_EXTRA_TYPE_HASH - 1];
    967
    968			switch (extra->u.hash.type) {
    969			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4:
    970			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6:
    971				type = PKT_HASH_TYPE_L3;
    972				break;
    973
    974			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP:
    975			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP:
    976				type = PKT_HASH_TYPE_L4;
    977				break;
    978
    979			default:
    980				break;
    981			}
    982
    983			if (type != PKT_HASH_TYPE_NONE)
    984				skb_set_hash(skb,
    985					     *(u32 *)extra->u.hash.value,
    986					     type);
    987		}
    988
    989		XENVIF_TX_CB(skb)->pending_idx = pending_idx;
    990
    991		__skb_put(skb, data_len);
    992		queue->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
    993		queue->tx_copy_ops[*copy_ops].source.domid = queue->vif->domid;
    994		queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
    995
    996		queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
    997			virt_to_gfn(skb->data);
    998		queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
    999		queue->tx_copy_ops[*copy_ops].dest.offset =
   1000			offset_in_page(skb->data) & ~XEN_PAGE_MASK;
   1001
   1002		queue->tx_copy_ops[*copy_ops].len = data_len;
   1003		queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
   1004
   1005		(*copy_ops)++;
   1006
   1007		if (data_len < txreq.size) {
   1008			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
   1009					     pending_idx);
   1010			xenvif_tx_create_map_op(queue, pending_idx, &txreq,
   1011						extra_count, gop);
   1012			gop++;
   1013		} else {
   1014			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
   1015					     INVALID_PENDING_IDX);
   1016			memcpy(&queue->pending_tx_info[pending_idx].req,
   1017			       &txreq, sizeof(txreq));
   1018			queue->pending_tx_info[pending_idx].extra_count =
   1019				extra_count;
   1020		}
   1021
   1022		queue->pending_cons++;
   1023
   1024		gop = xenvif_get_requests(queue, skb, txfrags, gop,
   1025				          frag_overflow, nskb);
   1026
   1027		__skb_queue_tail(&queue->tx_queue, skb);
   1028
   1029		queue->tx.req_cons = idx;
   1030
   1031		if (((gop-queue->tx_map_ops) >= ARRAY_SIZE(queue->tx_map_ops)) ||
   1032		    (*copy_ops >= ARRAY_SIZE(queue->tx_copy_ops)))
   1033			break;
   1034	}
   1035
   1036	(*map_ops) = gop - queue->tx_map_ops;
   1037	return;
   1038}
   1039
    1040/* Consolidate an skb with a frag_list into a brand new one with local pages
    1041 * on frags. Returns 0 or -ENOMEM if it can't allocate new pages.
   1042 */
   1043static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *skb)
   1044{
   1045	unsigned int offset = skb_headlen(skb);
   1046	skb_frag_t frags[MAX_SKB_FRAGS];
   1047	int i, f;
   1048	struct ubuf_info *uarg;
   1049	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
   1050
   1051	queue->stats.tx_zerocopy_sent += 2;
   1052	queue->stats.tx_frag_overflow++;
   1053
   1054	xenvif_fill_frags(queue, nskb);
   1055	/* Subtract frags size, we will correct it later */
   1056	skb->truesize -= skb->data_len;
   1057	skb->len += nskb->len;
   1058	skb->data_len += nskb->len;
   1059
   1060	/* create a brand new frags array and coalesce there */
   1061	for (i = 0; offset < skb->len; i++) {
   1062		struct page *page;
   1063		unsigned int len;
   1064
   1065		BUG_ON(i >= MAX_SKB_FRAGS);
   1066		page = alloc_page(GFP_ATOMIC);
   1067		if (!page) {
   1068			int j;
   1069			skb->truesize += skb->data_len;
   1070			for (j = 0; j < i; j++)
   1071				put_page(skb_frag_page(&frags[j]));
   1072			return -ENOMEM;
   1073		}
   1074
   1075		if (offset + PAGE_SIZE < skb->len)
   1076			len = PAGE_SIZE;
   1077		else
   1078			len = skb->len - offset;
   1079		if (skb_copy_bits(skb, offset, page_address(page), len))
   1080			BUG();
   1081
   1082		offset += len;
   1083		__skb_frag_set_page(&frags[i], page);
   1084		skb_frag_off_set(&frags[i], 0);
   1085		skb_frag_size_set(&frags[i], len);
   1086	}
   1087
   1088	/* Release all the original (foreign) frags. */
   1089	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
   1090		skb_frag_unref(skb, f);
   1091	uarg = skb_shinfo(skb)->destructor_arg;
   1092	/* increase inflight counter to offset decrement in callback */
   1093	atomic_inc(&queue->inflight_packets);
   1094	uarg->callback(NULL, uarg, true);
   1095	skb_shinfo(skb)->destructor_arg = NULL;
   1096
   1097	/* Fill the skb with the new (local) frags. */
   1098	memcpy(skb_shinfo(skb)->frags, frags, i * sizeof(skb_frag_t));
   1099	skb_shinfo(skb)->nr_frags = i;
   1100	skb->truesize += i * PAGE_SIZE;
   1101
   1102	return 0;
   1103}
   1104
   1105static int xenvif_tx_submit(struct xenvif_queue *queue)
   1106{
   1107	struct gnttab_map_grant_ref *gop_map = queue->tx_map_ops;
   1108	struct gnttab_copy *gop_copy = queue->tx_copy_ops;
   1109	struct sk_buff *skb;
   1110	int work_done = 0;
   1111
   1112	while ((skb = __skb_dequeue(&queue->tx_queue)) != NULL) {
   1113		struct xen_netif_tx_request *txp;
   1114		u16 pending_idx;
   1115		unsigned data_len;
   1116
   1117		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
   1118		txp = &queue->pending_tx_info[pending_idx].req;
   1119
   1120		/* Check the remap error code. */
   1121		if (unlikely(xenvif_tx_check_gop(queue, skb, &gop_map, &gop_copy))) {
   1122			/* If there was an error, xenvif_tx_check_gop is
   1123			 * expected to release all the frags which were mapped,
   1124			 * so kfree_skb shouldn't do it again
   1125			 */
   1126			skb_shinfo(skb)->nr_frags = 0;
   1127			if (skb_has_frag_list(skb)) {
   1128				struct sk_buff *nskb =
   1129						skb_shinfo(skb)->frag_list;
   1130				skb_shinfo(nskb)->nr_frags = 0;
   1131			}
   1132			kfree_skb(skb);
   1133			continue;
   1134		}
   1135
   1136		data_len = skb->len;
   1137		callback_param(queue, pending_idx).ctx = NULL;
   1138		if (data_len < txp->size) {
   1139			/* Append the packet payload as a fragment. */
   1140			txp->offset += data_len;
   1141			txp->size -= data_len;
   1142		} else {
   1143			/* Schedule a response immediately. */
   1144			xenvif_idx_release(queue, pending_idx,
   1145					   XEN_NETIF_RSP_OKAY);
   1146		}
   1147
   1148		if (txp->flags & XEN_NETTXF_csum_blank)
   1149			skb->ip_summed = CHECKSUM_PARTIAL;
   1150		else if (txp->flags & XEN_NETTXF_data_validated)
   1151			skb->ip_summed = CHECKSUM_UNNECESSARY;
   1152
   1153		xenvif_fill_frags(queue, skb);
   1154
   1155		if (unlikely(skb_has_frag_list(skb))) {
   1156			struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
   1157			xenvif_skb_zerocopy_prepare(queue, nskb);
   1158			if (xenvif_handle_frag_list(queue, skb)) {
   1159				if (net_ratelimit())
   1160					netdev_err(queue->vif->dev,
   1161						   "Not enough memory to consolidate frag_list!\n");
   1162				xenvif_skb_zerocopy_prepare(queue, skb);
   1163				kfree_skb(skb);
   1164				continue;
   1165			}
   1166			/* Copied all the bits from the frag list -- free it. */
   1167			skb_frag_list_init(skb);
   1168			kfree_skb(nskb);
   1169		}
   1170
   1171		skb->dev      = queue->vif->dev;
   1172		skb->protocol = eth_type_trans(skb, skb->dev);
   1173		skb_reset_network_header(skb);
   1174
   1175		if (checksum_setup(queue, skb)) {
   1176			netdev_dbg(queue->vif->dev,
   1177				   "Can't setup checksum in net_tx_action\n");
   1178			/* We have to set this flag to trigger the callback */
   1179			if (skb_shinfo(skb)->destructor_arg)
   1180				xenvif_skb_zerocopy_prepare(queue, skb);
   1181			kfree_skb(skb);
   1182			continue;
   1183		}
   1184
   1185		skb_probe_transport_header(skb);
   1186
   1187		/* If the packet is GSO then we will have just set up the
   1188		 * transport header offset in checksum_setup so it's now
   1189		 * straightforward to calculate gso_segs.
   1190		 */
   1191		if (skb_is_gso(skb)) {
   1192			int mss, hdrlen;
   1193
   1194			/* GSO implies having the L4 header. */
   1195			WARN_ON_ONCE(!skb_transport_header_was_set(skb));
   1196			if (unlikely(!skb_transport_header_was_set(skb))) {
   1197				kfree_skb(skb);
   1198				continue;
   1199			}
   1200
   1201			mss = skb_shinfo(skb)->gso_size;
   1202			hdrlen = skb_transport_header(skb) -
   1203				skb_mac_header(skb) +
   1204				tcp_hdrlen(skb);
   1205
   1206			skb_shinfo(skb)->gso_segs =
   1207				DIV_ROUND_UP(skb->len - hdrlen, mss);
   1208		}
   1209
   1210		queue->stats.rx_bytes += skb->len;
   1211		queue->stats.rx_packets++;
   1212
   1213		work_done++;
   1214
   1215		/* Set this flag right before netif_receive_skb, otherwise
   1216		 * someone might think this packet already left netback, and
   1217		 * do a skb_copy_ubufs while we are still in control of the
    1218		 * skb. E.g. the __pskb_pull_tail earlier can do such a thing.
   1219		 */
   1220		if (skb_shinfo(skb)->destructor_arg) {
   1221			xenvif_skb_zerocopy_prepare(queue, skb);
   1222			queue->stats.tx_zerocopy_sent++;
   1223		}
   1224
   1225		netif_receive_skb(skb);
   1226	}
   1227
   1228	return work_done;
   1229}
   1230
   1231void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
   1232			      bool zerocopy_success)
   1233{
   1234	unsigned long flags;
   1235	pending_ring_idx_t index;
   1236	struct xenvif_queue *queue = ubuf_to_queue(ubuf);
   1237
   1238	/* This is the only place where we grab this lock, to protect callbacks
   1239	 * from each other.
   1240	 */
   1241	spin_lock_irqsave(&queue->callback_lock, flags);
   1242	do {
   1243		u16 pending_idx = ubuf->desc;
   1244		ubuf = (struct ubuf_info *) ubuf->ctx;
   1245		BUG_ON(queue->dealloc_prod - queue->dealloc_cons >=
   1246			MAX_PENDING_REQS);
   1247		index = pending_index(queue->dealloc_prod);
   1248		queue->dealloc_ring[index] = pending_idx;
   1249		/* Sync with xenvif_tx_dealloc_action:
   1250		 * insert idx then incr producer.
   1251		 */
   1252		smp_wmb();
   1253		queue->dealloc_prod++;
   1254	} while (ubuf);
   1255	spin_unlock_irqrestore(&queue->callback_lock, flags);
   1256
   1257	if (likely(zerocopy_success))
   1258		queue->stats.tx_zerocopy_success++;
   1259	else
   1260		queue->stats.tx_zerocopy_fail++;
   1261	xenvif_skb_zerocopy_complete(queue);
   1262}
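
/*
 * Illustrative sketch (editor's addition): the callback above and
 * xenvif_tx_dealloc_action() below form a single-producer ring hand-off:
 * store the slot, smp_wmb(), then advance the producer; the consumer reads
 * the producer, smp_rmb(), then reads the slots. A minimal generic producer
 * step (hypothetical name, mask = ring size - 1 for a power-of-two ring):
 */
static inline void example_ring_publish(u16 *ring, unsigned int mask,
					unsigned int *prod, u16 val)
{
	ring[*prod & mask] = val;	/* 1. store the payload          */
	smp_wmb();			/* 2. order payload before index */
	(*prod)++;			/* 3. publish to the consumer    */
}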
   1263
   1264static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
   1265{
   1266	struct gnttab_unmap_grant_ref *gop;
   1267	pending_ring_idx_t dc, dp;
   1268	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
   1269	unsigned int i = 0;
   1270
   1271	dc = queue->dealloc_cons;
   1272	gop = queue->tx_unmap_ops;
   1273
   1274	/* Free up any grants we have finished using */
   1275	do {
   1276		dp = queue->dealloc_prod;
   1277
   1278		/* Ensure we see all indices enqueued by all
   1279		 * xenvif_zerocopy_callback().
   1280		 */
   1281		smp_rmb();
   1282
   1283		while (dc != dp) {
   1284			BUG_ON(gop - queue->tx_unmap_ops >= MAX_PENDING_REQS);
   1285			pending_idx =
   1286				queue->dealloc_ring[pending_index(dc++)];
   1287
   1288			pending_idx_release[gop - queue->tx_unmap_ops] =
   1289				pending_idx;
   1290			queue->pages_to_unmap[gop - queue->tx_unmap_ops] =
   1291				queue->mmap_pages[pending_idx];
   1292			gnttab_set_unmap_op(gop,
   1293					    idx_to_kaddr(queue, pending_idx),
   1294					    GNTMAP_host_map,
   1295					    queue->grant_tx_handle[pending_idx]);
   1296			xenvif_grant_handle_reset(queue, pending_idx);
   1297			++gop;
   1298		}
   1299
   1300	} while (dp != queue->dealloc_prod);
   1301
   1302	queue->dealloc_cons = dc;
   1303
   1304	if (gop - queue->tx_unmap_ops > 0) {
   1305		int ret;
   1306		ret = gnttab_unmap_refs(queue->tx_unmap_ops,
   1307					NULL,
   1308					queue->pages_to_unmap,
   1309					gop - queue->tx_unmap_ops);
   1310		if (ret) {
   1311			netdev_err(queue->vif->dev, "Unmap fail: nr_ops %tu ret %d\n",
   1312				   gop - queue->tx_unmap_ops, ret);
   1313			for (i = 0; i < gop - queue->tx_unmap_ops; ++i) {
   1314				if (gop[i].status != GNTST_okay)
   1315					netdev_err(queue->vif->dev,
   1316						   " host_addr: 0x%llx handle: 0x%x status: %d\n",
   1317						   gop[i].host_addr,
   1318						   gop[i].handle,
   1319						   gop[i].status);
   1320			}
   1321			BUG();
   1322		}
   1323	}
   1324
   1325	for (i = 0; i < gop - queue->tx_unmap_ops; ++i)
   1326		xenvif_idx_release(queue, pending_idx_release[i],
   1327				   XEN_NETIF_RSP_OKAY);
   1328}
   1329
   1330
   1331/* Called after netfront has transmitted */
   1332int xenvif_tx_action(struct xenvif_queue *queue, int budget)
   1333{
   1334	unsigned nr_mops, nr_cops = 0;
   1335	int work_done, ret;
   1336
   1337	if (unlikely(!tx_work_todo(queue)))
   1338		return 0;
   1339
   1340	xenvif_tx_build_gops(queue, budget, &nr_cops, &nr_mops);
   1341
   1342	if (nr_cops == 0)
   1343		return 0;
   1344
   1345	gnttab_batch_copy(queue->tx_copy_ops, nr_cops);
   1346	if (nr_mops != 0) {
   1347		ret = gnttab_map_refs(queue->tx_map_ops,
   1348				      NULL,
   1349				      queue->pages_to_map,
   1350				      nr_mops);
   1351		if (ret) {
   1352			unsigned int i;
   1353
   1354			netdev_err(queue->vif->dev, "Map fail: nr %u ret %d\n",
   1355				   nr_mops, ret);
   1356			for (i = 0; i < nr_mops; ++i)
   1357				WARN_ON_ONCE(queue->tx_map_ops[i].status ==
   1358				             GNTST_okay);
   1359		}
   1360	}
   1361
   1362	work_done = xenvif_tx_submit(queue);
   1363
   1364	return work_done;
   1365}
   1366
   1367static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
   1368			       u8 status)
   1369{
   1370	struct pending_tx_info *pending_tx_info;
   1371	pending_ring_idx_t index;
   1372	unsigned long flags;
   1373
   1374	pending_tx_info = &queue->pending_tx_info[pending_idx];
   1375
   1376	spin_lock_irqsave(&queue->response_lock, flags);
   1377
   1378	make_tx_response(queue, &pending_tx_info->req,
   1379			 pending_tx_info->extra_count, status);
   1380
    1381	/* Release the pending index before pushing the Tx response so
    1382	 * it's available before a new Tx request is pushed by the
   1383	 * frontend.
   1384	 */
   1385	index = pending_index(queue->pending_prod++);
   1386	queue->pending_ring[index] = pending_idx;
   1387
   1388	push_tx_responses(queue);
   1389
   1390	spin_unlock_irqrestore(&queue->response_lock, flags);
   1391}
   1392
   1393
   1394static void make_tx_response(struct xenvif_queue *queue,
   1395			     struct xen_netif_tx_request *txp,
   1396			     unsigned int extra_count,
   1397			     s8       st)
   1398{
   1399	RING_IDX i = queue->tx.rsp_prod_pvt;
   1400	struct xen_netif_tx_response *resp;
   1401
   1402	resp = RING_GET_RESPONSE(&queue->tx, i);
   1403	resp->id     = txp->id;
   1404	resp->status = st;
   1405
   1406	while (extra_count-- != 0)
   1407		RING_GET_RESPONSE(&queue->tx, ++i)->status = XEN_NETIF_RSP_NULL;
   1408
   1409	queue->tx.rsp_prod_pvt = ++i;
   1410}
   1411
   1412static void push_tx_responses(struct xenvif_queue *queue)
   1413{
   1414	int notify;
   1415
   1416	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->tx, notify);
   1417	if (notify)
   1418		notify_remote_via_irq(queue->tx_irq);
   1419}
   1420
   1421void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx)
   1422{
   1423	int ret;
   1424	struct gnttab_unmap_grant_ref tx_unmap_op;
   1425
   1426	gnttab_set_unmap_op(&tx_unmap_op,
   1427			    idx_to_kaddr(queue, pending_idx),
   1428			    GNTMAP_host_map,
   1429			    queue->grant_tx_handle[pending_idx]);
   1430	xenvif_grant_handle_reset(queue, pending_idx);
   1431
   1432	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
   1433				&queue->mmap_pages[pending_idx], 1);
   1434	if (ret) {
   1435		netdev_err(queue->vif->dev,
   1436			   "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: 0x%x status: %d\n",
   1437			   ret,
   1438			   pending_idx,
   1439			   tx_unmap_op.host_addr,
   1440			   tx_unmap_op.handle,
   1441			   tx_unmap_op.status);
   1442		BUG();
   1443	}
   1444}
   1445
   1446static inline int tx_work_todo(struct xenvif_queue *queue)
   1447{
   1448	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&queue->tx)))
   1449		return 1;
   1450
   1451	return 0;
   1452}
   1453
   1454static inline bool tx_dealloc_work_todo(struct xenvif_queue *queue)
   1455{
   1456	return queue->dealloc_cons != queue->dealloc_prod;
   1457}
   1458
   1459void xenvif_unmap_frontend_data_rings(struct xenvif_queue *queue)
   1460{
   1461	if (queue->tx.sring)
   1462		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
   1463					queue->tx.sring);
   1464	if (queue->rx.sring)
   1465		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
   1466					queue->rx.sring);
   1467}
   1468
   1469int xenvif_map_frontend_data_rings(struct xenvif_queue *queue,
   1470				   grant_ref_t tx_ring_ref,
   1471				   grant_ref_t rx_ring_ref)
   1472{
   1473	void *addr;
   1474	struct xen_netif_tx_sring *txs;
   1475	struct xen_netif_rx_sring *rxs;
   1476	RING_IDX rsp_prod, req_prod;
   1477	int err;
   1478
   1479	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
   1480				     &tx_ring_ref, 1, &addr);
   1481	if (err)
   1482		goto err;
   1483
   1484	txs = (struct xen_netif_tx_sring *)addr;
   1485	rsp_prod = READ_ONCE(txs->rsp_prod);
   1486	req_prod = READ_ONCE(txs->req_prod);
   1487
   1488	BACK_RING_ATTACH(&queue->tx, txs, rsp_prod, XEN_PAGE_SIZE);
   1489
   1490	err = -EIO;
   1491	if (req_prod - rsp_prod > RING_SIZE(&queue->tx))
   1492		goto err;
   1493
   1494	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
   1495				     &rx_ring_ref, 1, &addr);
   1496	if (err)
   1497		goto err;
   1498
   1499	rxs = (struct xen_netif_rx_sring *)addr;
   1500	rsp_prod = READ_ONCE(rxs->rsp_prod);
   1501	req_prod = READ_ONCE(rxs->req_prod);
   1502
   1503	BACK_RING_ATTACH(&queue->rx, rxs, rsp_prod, XEN_PAGE_SIZE);
   1504
   1505	err = -EIO;
   1506	if (req_prod - rsp_prod > RING_SIZE(&queue->rx))
   1507		goto err;
   1508
   1509	return 0;
   1510
   1511err:
   1512	xenvif_unmap_frontend_data_rings(queue);
   1513	return err;
   1514}
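
/*
 * Illustrative sketch (editor's addition): the two req_prod/rsp_prod checks
 * above reject a shared ring whose indices claim more outstanding requests
 * than the ring can hold, i.e. a buggy or malicious frontend. Expressed on
 * its own (hypothetical name; unsigned subtraction handles index wrap):
 */
static inline bool example_ring_indices_sane(RING_IDX req_prod,
					     RING_IDX rsp_prod,
					     unsigned int ring_size)
{
	return (RING_IDX)(req_prod - rsp_prod) <= ring_size;
}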
   1515
   1516static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
   1517{
   1518	/* Dealloc thread must remain running until all inflight
   1519	 * packets complete.
   1520	 */
   1521	return kthread_should_stop() &&
   1522		!atomic_read(&queue->inflight_packets);
   1523}
   1524
   1525int xenvif_dealloc_kthread(void *data)
   1526{
   1527	struct xenvif_queue *queue = data;
   1528
   1529	for (;;) {
   1530		wait_event_interruptible(queue->dealloc_wq,
   1531					 tx_dealloc_work_todo(queue) ||
   1532					 xenvif_dealloc_kthread_should_stop(queue));
   1533		if (xenvif_dealloc_kthread_should_stop(queue))
   1534			break;
   1535
   1536		xenvif_tx_dealloc_action(queue);
   1537		cond_resched();
   1538	}
   1539
    1540	/* Unmap anything remaining. */
   1541	if (tx_dealloc_work_todo(queue))
   1542		xenvif_tx_dealloc_action(queue);
   1543
   1544	return 0;
   1545}
   1546
   1547static void make_ctrl_response(struct xenvif *vif,
   1548			       const struct xen_netif_ctrl_request *req,
   1549			       u32 status, u32 data)
   1550{
   1551	RING_IDX idx = vif->ctrl.rsp_prod_pvt;
   1552	struct xen_netif_ctrl_response rsp = {
   1553		.id = req->id,
   1554		.type = req->type,
   1555		.status = status,
   1556		.data = data,
   1557	};
   1558
   1559	*RING_GET_RESPONSE(&vif->ctrl, idx) = rsp;
   1560	vif->ctrl.rsp_prod_pvt = ++idx;
   1561}
   1562
   1563static void push_ctrl_response(struct xenvif *vif)
   1564{
   1565	int notify;
   1566
   1567	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->ctrl, notify);
   1568	if (notify)
   1569		notify_remote_via_irq(vif->ctrl_irq);
   1570}
   1571
   1572static void process_ctrl_request(struct xenvif *vif,
   1573				 const struct xen_netif_ctrl_request *req)
   1574{
   1575	u32 status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED;
   1576	u32 data = 0;
   1577
   1578	switch (req->type) {
   1579	case XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM:
   1580		status = xenvif_set_hash_alg(vif, req->data[0]);
   1581		break;
   1582
   1583	case XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS:
   1584		status = xenvif_get_hash_flags(vif, &data);
   1585		break;
   1586
   1587	case XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS:
   1588		status = xenvif_set_hash_flags(vif, req->data[0]);
   1589		break;
   1590
   1591	case XEN_NETIF_CTRL_TYPE_SET_HASH_KEY:
   1592		status = xenvif_set_hash_key(vif, req->data[0],
   1593					     req->data[1]);
   1594		break;
   1595
   1596	case XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE:
   1597		status = XEN_NETIF_CTRL_STATUS_SUCCESS;
   1598		data = XEN_NETBK_MAX_HASH_MAPPING_SIZE;
   1599		break;
   1600
   1601	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE:
   1602		status = xenvif_set_hash_mapping_size(vif,
   1603						      req->data[0]);
   1604		break;
   1605
   1606	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING:
   1607		status = xenvif_set_hash_mapping(vif, req->data[0],
   1608						 req->data[1],
   1609						 req->data[2]);
   1610		break;
   1611
   1612	default:
   1613		break;
   1614	}
   1615
   1616	make_ctrl_response(vif, req, status, data);
   1617	push_ctrl_response(vif);
   1618}
   1619
   1620static void xenvif_ctrl_action(struct xenvif *vif)
   1621{
   1622	for (;;) {
   1623		RING_IDX req_prod, req_cons;
   1624
   1625		req_prod = vif->ctrl.sring->req_prod;
   1626		req_cons = vif->ctrl.req_cons;
   1627
   1628		/* Make sure we can see requests before we process them. */
   1629		rmb();
   1630
   1631		if (req_cons == req_prod)
   1632			break;
   1633
   1634		while (req_cons != req_prod) {
   1635			struct xen_netif_ctrl_request req;
   1636
   1637			RING_COPY_REQUEST(&vif->ctrl, req_cons, &req);
   1638			req_cons++;
   1639
   1640			process_ctrl_request(vif, &req);
   1641		}
   1642
   1643		vif->ctrl.req_cons = req_cons;
   1644		vif->ctrl.sring->req_event = req_cons + 1;
   1645	}
   1646}
   1647
   1648static bool xenvif_ctrl_work_todo(struct xenvif *vif)
   1649{
   1650	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->ctrl)))
   1651		return true;
   1652
   1653	return false;
   1654}
   1655
   1656irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data)
   1657{
   1658	struct xenvif *vif = data;
   1659	unsigned int eoi_flag = XEN_EOI_FLAG_SPURIOUS;
   1660
   1661	while (xenvif_ctrl_work_todo(vif)) {
   1662		xenvif_ctrl_action(vif);
   1663		eoi_flag = 0;
   1664	}
   1665
   1666	xen_irq_lateeoi(irq, eoi_flag);
   1667
   1668	return IRQ_HANDLED;
   1669}
   1670
   1671static int __init netback_init(void)
   1672{
   1673	int rc = 0;
   1674
   1675	if (!xen_domain())
   1676		return -ENODEV;
   1677
    1678	/* Allow as many queues as there are CPUs, but at most 8 if the user
    1679	 * has not specified a value.
   1680	 */
   1681	if (xenvif_max_queues == 0)
   1682		xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
   1683					  num_online_cpus());
   1684
   1685	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
   1686		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
   1687			fatal_skb_slots, XEN_NETBK_LEGACY_SLOTS_MAX);
   1688		fatal_skb_slots = XEN_NETBK_LEGACY_SLOTS_MAX;
   1689	}
   1690
   1691	rc = xenvif_xenbus_init();
   1692	if (rc)
   1693		goto failed_init;
   1694
   1695#ifdef CONFIG_DEBUG_FS
   1696	xen_netback_dbg_root = debugfs_create_dir("xen-netback", NULL);
   1697#endif /* CONFIG_DEBUG_FS */
   1698
   1699	return 0;
   1700
   1701failed_init:
   1702	return rc;
   1703}
   1704
   1705module_init(netback_init);
   1706
   1707static void __exit netback_fini(void)
   1708{
   1709#ifdef CONFIG_DEBUG_FS
   1710	debugfs_remove_recursive(xen_netback_dbg_root);
   1711#endif /* CONFIG_DEBUG_FS */
   1712	xenvif_xenbus_fini();
   1713}
   1714module_exit(netback_fini);
   1715
   1716MODULE_LICENSE("Dual BSD/GPL");
   1717MODULE_ALIAS("xen-backend:vif");