cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blkback.c (42006B)


      1/******************************************************************************
      2 *
      3 * Back-end of the driver for virtual block devices. This portion of the
      4 * driver exports a 'unified' block-device interface that can be accessed
      5 * by any operating system that implements a compatible front end. A
      6 * reference front-end implementation can be found in:
      7 *  drivers/block/xen-blkfront.c
      8 *
      9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
     10 * Copyright (c) 2005, Christopher Clark
     11 *
     12 * This program is free software; you can redistribute it and/or
     13 * modify it under the terms of the GNU General Public License version 2
     14 * as published by the Free Software Foundation; or, when distributed
     15 * separately from the Linux kernel or incorporated into other
     16 * software packages, subject to the following license:
     17 *
     18 * Permission is hereby granted, free of charge, to any person obtaining a copy
     19 * of this source file (the "Software"), to deal in the Software without
     20 * restriction, including without limitation the rights to use, copy, modify,
     21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
     22 * and to permit persons to whom the Software is furnished to do so, subject to
     23 * the following conditions:
     24 *
     25 * The above copyright notice and this permission notice shall be included in
     26 * all copies or substantial portions of the Software.
     27 *
     28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     34 * IN THE SOFTWARE.
     35 */
     36
     37#define pr_fmt(fmt) "xen-blkback: " fmt
     38
     39#include <linux/spinlock.h>
     40#include <linux/kthread.h>
     41#include <linux/list.h>
     42#include <linux/delay.h>
     43#include <linux/freezer.h>
     44#include <linux/bitmap.h>
     45
     46#include <xen/events.h>
     47#include <xen/page.h>
     48#include <xen/xen.h>
     49#include <asm/xen/hypervisor.h>
     50#include <asm/xen/hypercall.h>
     51#include <xen/balloon.h>
     52#include <xen/grant_table.h>
     53#include "common.h"
     54
     55/*
     56 * Maximum number of unused free pages to keep in the internal buffer.
      57 * Setting this to a low value reduces the memory used by each backend,
      58 * but can incur a performance penalty.
     59 *
     60 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
     61 * be set to a lower value that might degrade performance on some intensive
     62 * IO workloads.
     63 */
     64
     65static int max_buffer_pages = 1024;
     66module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
     67MODULE_PARM_DESC(max_buffer_pages,
     68"Maximum number of free pages to keep in each block backend buffer");
     69
     70/*
     71 * Maximum number of grants to map persistently in blkback. For maximum
     72 * performance this should be the total numbers of grants that can be used
     73 * to fill the ring, but since this might become too high, especially with
     74 * the use of indirect descriptors, we set it to a value that provides good
     75 * performance without using too much memory.
     76 *
     77 * When the list of persistent grants is full we clean it up using a LRU
     78 * algorithm.
     79 */
     80
     81static int max_pgrants = 1056;
     82module_param_named(max_persistent_grants, max_pgrants, int, 0644);
     83MODULE_PARM_DESC(max_persistent_grants,
     84                 "Maximum number of grants to map persistently");
     85
     86/*
     87 * How long a persistent grant is allowed to remain allocated without being in
     88 * use. The time is in seconds; 0 means unlimited.
     89 */
     90
     91static unsigned int pgrant_timeout = 60;
     92module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
     93		   uint, 0644);
     94MODULE_PARM_DESC(persistent_grant_unused_seconds,
     95		 "Time in seconds an unused persistent grant is allowed to "
     96		 "remain allocated. Default is 60, 0 means unlimited.");
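       /*
        * Illustrative note: both knobs above are declared with mode 0644, so
        * they can be tuned at runtime via sysfs. Assuming the backend is built
        * as the xen_blkback module, that would look like (example values):
        *
        *   echo 2048 > /sys/module/xen_blkback/parameters/max_persistent_grants
        *   echo 120 > /sys/module/xen_blkback/parameters/persistent_grant_unused_seconds
        */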
     97
     98/*
     99 * Maximum number of rings/queues blkback supports; allow as many queues as
    100 * there are CPUs if the user has not specified a value.
    101 */
    102unsigned int xenblk_max_queues;
    103module_param_named(max_queues, xenblk_max_queues, uint, 0644);
    104MODULE_PARM_DESC(max_queues,
    105		 "Maximum number of hardware queues per virtual disk. " \
    106		 "By default it is the number of online CPUs.");
    107
    108/*
    109 * Maximum order of pages to be used for the shared ring between front and
    110 * backend; 4KB page granularity is used.
    111 */
    112unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
    113module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
    114MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
    115/*
    116 * The LRU mechanism to clean the lists of persistent grants needs to
    117 * be executed periodically. The time interval between consecutive executions
    118 * of the purge mechanism is set in ms.
    119 */
    120#define LRU_INTERVAL 100
    121
    122/*
    123 * When the persistent grants list is full we will remove unused grants
    124 * from the list. This is the percentage of grants to be removed at each
    125 * LRU execution.
    126 */
    127#define LRU_PERCENT_CLEAN 5
    128
    129/* Run-time switchable: /sys/module/xen_blkback/parameters/ */
    130static unsigned int log_stats;
    131module_param(log_stats, int, 0644);
    132
    133#define BLKBACK_INVALID_HANDLE (~0)
    134
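       /*
        * True when a timeout is configured and the grant has been idle for at
        * least pgrant_timeout seconds (last_used is recorded in jiffies).
        */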
    135static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
    136{
    137	return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
    138			HZ * pgrant_timeout);
    139}
    140
    141#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
    142
    143static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
    144static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
    145				struct blkif_request *req,
    146				struct pending_req *pending_req);
    147static void make_response(struct xen_blkif_ring *ring, u64 id,
    148			  unsigned short op, int st);
    149
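       /*
        * Safe rbtree walk: the next node is cached in @n before the loop body
        * runs, so the body may rb_erase() and free the current node.
        */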
    150#define foreach_grant_safe(pos, n, rbtree, node) \
    151	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
    152	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
    153	     &(pos)->node != NULL; \
    154	     (pos) = container_of(n, typeof(*(pos)), node), \
    155	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
    156
    157
    158/*
    159 * We don't need locking around the persistent grant helpers
     160 * because blkback uses a single thread for each backend, so we
     161 * can be sure that these functions will never be called recursively.
     162 *
     163 * The only exception is put_persistent_gnt, which can be called
    164 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
    165 * bit operations to modify the flags of a persistent grant and to count
    166 * the number of used grants.
    167 */
    168static int add_persistent_gnt(struct xen_blkif_ring *ring,
    169			       struct persistent_gnt *persistent_gnt)
    170{
    171	struct rb_node **new = NULL, *parent = NULL;
    172	struct persistent_gnt *this;
    173	struct xen_blkif *blkif = ring->blkif;
    174
    175	if (ring->persistent_gnt_c >= max_pgrants) {
    176		if (!blkif->vbd.overflow_max_grants)
    177			blkif->vbd.overflow_max_grants = 1;
    178		return -EBUSY;
    179	}
    180	/* Figure out where to put new node */
    181	new = &ring->persistent_gnts.rb_node;
    182	while (*new) {
    183		this = container_of(*new, struct persistent_gnt, node);
    184
    185		parent = *new;
    186		if (persistent_gnt->gnt < this->gnt)
    187			new = &((*new)->rb_left);
    188		else if (persistent_gnt->gnt > this->gnt)
    189			new = &((*new)->rb_right);
    190		else {
    191			pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
    192			return -EINVAL;
    193		}
    194	}
    195
    196	persistent_gnt->active = true;
    197	/* Add new node and rebalance tree. */
    198	rb_link_node(&(persistent_gnt->node), parent, new);
    199	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
    200	ring->persistent_gnt_c++;
    201	atomic_inc(&ring->persistent_gnt_in_use);
    202	return 0;
    203}
    204
    205static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
    206						 grant_ref_t gref)
    207{
    208	struct persistent_gnt *data;
    209	struct rb_node *node = NULL;
    210
    211	node = ring->persistent_gnts.rb_node;
    212	while (node) {
    213		data = container_of(node, struct persistent_gnt, node);
    214
    215		if (gref < data->gnt)
    216			node = node->rb_left;
    217		else if (gref > data->gnt)
    218			node = node->rb_right;
    219		else {
    220			if (data->active) {
    221				pr_alert_ratelimited("requesting a grant already in use\n");
    222				return NULL;
    223			}
    224			data->active = true;
    225			atomic_inc(&ring->persistent_gnt_in_use);
    226			return data;
    227		}
    228	}
    229	return NULL;
    230}
    231
    232static void put_persistent_gnt(struct xen_blkif_ring *ring,
    233                               struct persistent_gnt *persistent_gnt)
    234{
    235	if (!persistent_gnt->active)
    236		pr_alert_ratelimited("freeing a grant already unused\n");
    237	persistent_gnt->last_used = jiffies;
    238	persistent_gnt->active = false;
    239	atomic_dec(&ring->persistent_gnt_in_use);
    240}
    241
    242static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
    243                                 unsigned int num)
    244{
    245	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    246	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    247	struct persistent_gnt *persistent_gnt;
    248	struct rb_node *n;
    249	int segs_to_unmap = 0;
    250	struct gntab_unmap_queue_data unmap_data;
    251
    252	unmap_data.pages = pages;
    253	unmap_data.unmap_ops = unmap;
    254	unmap_data.kunmap_ops = NULL;
    255
    256	foreach_grant_safe(persistent_gnt, n, root, node) {
    257		BUG_ON(persistent_gnt->handle ==
    258			BLKBACK_INVALID_HANDLE);
    259		gnttab_set_unmap_op(&unmap[segs_to_unmap],
    260			(unsigned long) pfn_to_kaddr(page_to_pfn(
    261				persistent_gnt->page)),
    262			GNTMAP_host_map,
    263			persistent_gnt->handle);
    264
    265		pages[segs_to_unmap] = persistent_gnt->page;
    266
    267		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
    268			!rb_next(&persistent_gnt->node)) {
    269
    270			unmap_data.count = segs_to_unmap;
    271			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
    272
    273			gnttab_page_cache_put(&ring->free_pages, pages,
    274					      segs_to_unmap);
    275			segs_to_unmap = 0;
    276		}
    277
    278		rb_erase(&persistent_gnt->node, root);
    279		kfree(persistent_gnt);
    280		num--;
    281	}
    282	BUG_ON(num != 0);
    283}
    284
    285void xen_blkbk_unmap_purged_grants(struct work_struct *work)
    286{
    287	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    288	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    289	struct persistent_gnt *persistent_gnt;
    290	int segs_to_unmap = 0;
    291	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
    292	struct gntab_unmap_queue_data unmap_data;
    293
    294	unmap_data.pages = pages;
    295	unmap_data.unmap_ops = unmap;
    296	unmap_data.kunmap_ops = NULL;
    297
     298	while (!list_empty(&ring->persistent_purge_list)) {
    299		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
    300		                                  struct persistent_gnt,
    301		                                  remove_node);
    302		list_del(&persistent_gnt->remove_node);
    303
    304		gnttab_set_unmap_op(&unmap[segs_to_unmap],
    305			vaddr(persistent_gnt->page),
    306			GNTMAP_host_map,
    307			persistent_gnt->handle);
    308
    309		pages[segs_to_unmap] = persistent_gnt->page;
    310
    311		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
    312			unmap_data.count = segs_to_unmap;
    313			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
    314			gnttab_page_cache_put(&ring->free_pages, pages,
    315					      segs_to_unmap);
    316			segs_to_unmap = 0;
    317		}
    318		kfree(persistent_gnt);
    319	}
    320	if (segs_to_unmap > 0) {
    321		unmap_data.count = segs_to_unmap;
    322		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
    323		gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
    324	}
    325}
    326
    327static void purge_persistent_gnt(struct xen_blkif_ring *ring)
    328{
    329	struct persistent_gnt *persistent_gnt;
    330	struct rb_node *n;
    331	unsigned int num_clean, total;
    332	bool scan_used = false;
    333	struct rb_root *root;
    334
    335	if (work_busy(&ring->persistent_purge_work)) {
    336		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
    337		goto out;
    338	}
    339
    340	if (ring->persistent_gnt_c < max_pgrants ||
    341	    (ring->persistent_gnt_c == max_pgrants &&
    342	    !ring->blkif->vbd.overflow_max_grants)) {
    343		num_clean = 0;
    344	} else {
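       		/*
       		 * Target: everything above max_pgrants plus an extra
       		 * LRU_PERCENT_CLEAN percent of max_pgrants, clamped to the
       		 * number of grants currently in the tree.
       		 */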
    345		num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
    346		num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
    347		num_clean = min(ring->persistent_gnt_c, num_clean);
    348		pr_debug("Going to purge at least %u persistent grants\n",
    349			 num_clean);
    350	}
    351
    352	/*
     353	 * At this point, we can be sure that there will be no calls
     354	 * to get_persistent_gnt (because we are executing this code from
     355	 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
     356	 * which means that the number of currently used grants will go down,
     357	 * but never up, so we will always be able to remove the requested
     358	 * number of grants.
    359	 */
    360
    361	total = 0;
    362
    363	BUG_ON(!list_empty(&ring->persistent_purge_list));
    364	root = &ring->persistent_gnts;
    365purge_list:
    366	foreach_grant_safe(persistent_gnt, n, root, node) {
    367		BUG_ON(persistent_gnt->handle ==
    368			BLKBACK_INVALID_HANDLE);
    369
    370		if (persistent_gnt->active)
    371			continue;
    372		if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
    373			continue;
    374		if (scan_used && total >= num_clean)
    375			continue;
    376
    377		rb_erase(&persistent_gnt->node, root);
    378		list_add(&persistent_gnt->remove_node,
    379			 &ring->persistent_purge_list);
    380		total++;
    381	}
    382	/*
    383	 * Check whether we also need to start cleaning
     384	 * grants that were used since the last purge in order to cope
     385	 * with the requested number.
    386	 */
    387	if (!scan_used && total < num_clean) {
    388		pr_debug("Still missing %u purged frames\n", num_clean - total);
    389		scan_used = true;
    390		goto purge_list;
    391	}
    392
    393	if (total) {
    394		ring->persistent_gnt_c -= total;
    395		ring->blkif->vbd.overflow_max_grants = 0;
    396
    397		/* We can defer this work */
    398		schedule_work(&ring->persistent_purge_work);
    399		pr_debug("Purged %u/%u\n", num_clean, total);
    400	}
    401
    402out:
    403	return;
    404}
    405
    406/*
    407 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
    408 */
    409static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
    410{
    411	struct pending_req *req = NULL;
    412	unsigned long flags;
    413
    414	spin_lock_irqsave(&ring->pending_free_lock, flags);
    415	if (!list_empty(&ring->pending_free)) {
    416		req = list_entry(ring->pending_free.next, struct pending_req,
    417				 free_list);
    418		list_del(&req->free_list);
    419	}
    420	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
    421	return req;
    422}
    423
    424/*
     425 * Return the 'pending_req' structure back to the free pool. We also
     426 * wake up the thread if it was waiting for a free request.
    427 */
    428static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
    429{
    430	unsigned long flags;
    431	int was_empty;
    432
    433	spin_lock_irqsave(&ring->pending_free_lock, flags);
    434	was_empty = list_empty(&ring->pending_free);
    435	list_add(&req->free_list, &ring->pending_free);
    436	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
    437	if (was_empty)
    438		wake_up(&ring->pending_free_wq);
    439}
    440
    441/*
    442 * Routines for managing virtual block devices (vbds).
    443 */
    444static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
    445			     int operation)
    446{
    447	struct xen_vbd *vbd = &blkif->vbd;
    448	int rc = -EACCES;
    449
    450	if ((operation != REQ_OP_READ) && vbd->readonly)
    451		goto out;
    452
    453	if (likely(req->nr_sects)) {
    454		blkif_sector_t end = req->sector_number + req->nr_sects;
    455
    456		if (unlikely(end < req->sector_number))
    457			goto out;
    458		if (unlikely(end > vbd_sz(vbd)))
    459			goto out;
    460	}
    461
    462	req->dev  = vbd->pdevice;
    463	req->bdev = vbd->bdev;
    464	rc = 0;
    465
    466 out:
    467	return rc;
    468}
    469
    470static void xen_vbd_resize(struct xen_blkif *blkif)
    471{
    472	struct xen_vbd *vbd = &blkif->vbd;
    473	struct xenbus_transaction xbt;
    474	int err;
    475	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
    476	unsigned long long new_size = vbd_sz(vbd);
    477
    478	pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
    479		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
    480	pr_info("VBD Resize: new size %llu\n", new_size);
    481	vbd->size = new_size;
    482again:
    483	err = xenbus_transaction_start(&xbt);
    484	if (err) {
    485		pr_warn("Error starting transaction\n");
    486		return;
    487	}
    488	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
    489			    (unsigned long long)vbd_sz(vbd));
    490	if (err) {
    491		pr_warn("Error writing new size\n");
    492		goto abort;
    493	}
    494	/*
    495	 * Write the current state; we will use this to synchronize
    496	 * the front-end. If the current state is "connected" the
    497	 * front-end will get the new size information online.
    498	 */
    499	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
    500	if (err) {
    501		pr_warn("Error writing the state\n");
    502		goto abort;
    503	}
    504
    505	err = xenbus_transaction_end(xbt, 0);
    506	if (err == -EAGAIN)
    507		goto again;
    508	if (err)
    509		pr_warn("Error ending transaction\n");
    510	return;
    511abort:
    512	xenbus_transaction_end(xbt, 1);
    513}
    514
    515/*
    516 * Notification from the guest OS.
    517 */
    518static void blkif_notify_work(struct xen_blkif_ring *ring)
    519{
    520	ring->waiting_reqs = 1;
    521	wake_up(&ring->wq);
    522}
    523
    524irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
    525{
    526	blkif_notify_work(dev_id);
    527	return IRQ_HANDLED;
    528}
    529
    530/*
    531 * SCHEDULER FUNCTIONS
    532 */
    533
    534static void print_stats(struct xen_blkif_ring *ring)
    535{
    536	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
    537		 "  |  ds %4llu | pg: %4u/%4d\n",
    538		 current->comm, ring->st_oo_req,
    539		 ring->st_rd_req, ring->st_wr_req,
    540		 ring->st_f_req, ring->st_ds_req,
    541		 ring->persistent_gnt_c, max_pgrants);
    542	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
    543	ring->st_rd_req = 0;
    544	ring->st_wr_req = 0;
    545	ring->st_oo_req = 0;
    546	ring->st_ds_req = 0;
    547}
    548
    549int xen_blkif_schedule(void *arg)
    550{
    551	struct xen_blkif_ring *ring = arg;
    552	struct xen_blkif *blkif = ring->blkif;
    553	struct xen_vbd *vbd = &blkif->vbd;
    554	unsigned long timeout;
    555	int ret;
    556	bool do_eoi;
    557	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
    558
    559	set_freezable();
    560	while (!kthread_should_stop()) {
    561		if (try_to_freeze())
    562			continue;
    563		if (unlikely(vbd->size != vbd_sz(vbd)))
    564			xen_vbd_resize(blkif);
    565
    566		timeout = msecs_to_jiffies(LRU_INTERVAL);
    567
    568		timeout = wait_event_interruptible_timeout(
    569			ring->wq,
    570			ring->waiting_reqs || kthread_should_stop(),
    571			timeout);
    572		if (timeout == 0)
    573			goto purge_gnt_list;
    574		timeout = wait_event_interruptible_timeout(
    575			ring->pending_free_wq,
    576			!list_empty(&ring->pending_free) ||
    577			kthread_should_stop(),
    578			timeout);
    579		if (timeout == 0)
    580			goto purge_gnt_list;
    581
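       		/*
       		 * Note whether work was already flagged before processing the
       		 * ring; if no further requests arrive while we work, the event
       		 * is acknowledged with a late EOI below.
       		 */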
    582		do_eoi = ring->waiting_reqs;
    583
    584		ring->waiting_reqs = 0;
    585		smp_mb(); /* clear flag *before* checking for work */
    586
    587		ret = do_block_io_op(ring, &eoi_flags);
    588		if (ret > 0)
    589			ring->waiting_reqs = 1;
    590		if (ret == -EACCES)
    591			wait_event_interruptible(ring->shutdown_wq,
    592						 kthread_should_stop());
    593
    594		if (do_eoi && !ring->waiting_reqs) {
    595			xen_irq_lateeoi(ring->irq, eoi_flags);
    596			eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
    597		}
    598
    599purge_gnt_list:
    600		if (blkif->vbd.feature_gnt_persistent &&
    601		    time_after(jiffies, ring->next_lru)) {
    602			purge_persistent_gnt(ring);
    603			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
    604		}
    605
    606		/* Shrink the free pages pool if it is too large. */
    607		if (time_before(jiffies, blkif->buffer_squeeze_end))
    608			gnttab_page_cache_shrink(&ring->free_pages, 0);
    609		else
    610			gnttab_page_cache_shrink(&ring->free_pages,
    611						 max_buffer_pages);
    612
    613		if (log_stats && time_after(jiffies, ring->st_print))
    614			print_stats(ring);
    615	}
    616
    617	/* Drain pending purge work */
    618	flush_work(&ring->persistent_purge_work);
    619
    620	if (log_stats)
    621		print_stats(ring);
    622
    623	ring->xenblkd = NULL;
    624
    625	return 0;
    626}
    627
    628/*
    629 * Remove persistent grants and empty the pool of free pages
    630 */
    631void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
    632{
    633	/* Free all persistent grant pages */
    634	if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
    635		free_persistent_gnts(ring, &ring->persistent_gnts,
    636			ring->persistent_gnt_c);
    637
    638	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
    639	ring->persistent_gnt_c = 0;
    640
    641	/* Since we are shutting down remove all pages from the buffer */
    642	gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
    643}
    644
    645static unsigned int xen_blkbk_unmap_prepare(
    646	struct xen_blkif_ring *ring,
    647	struct grant_page **pages,
    648	unsigned int num,
    649	struct gnttab_unmap_grant_ref *unmap_ops,
    650	struct page **unmap_pages)
    651{
    652	unsigned int i, invcount = 0;
    653
    654	for (i = 0; i < num; i++) {
    655		if (pages[i]->persistent_gnt != NULL) {
    656			put_persistent_gnt(ring, pages[i]->persistent_gnt);
    657			continue;
    658		}
    659		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
    660			continue;
    661		unmap_pages[invcount] = pages[i]->page;
    662		gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
    663				    GNTMAP_host_map, pages[i]->handle);
    664		pages[i]->handle = BLKBACK_INVALID_HANDLE;
    665		invcount++;
    666	}
    667
    668	return invcount;
    669}
    670
    671static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
    672{
    673	struct pending_req *pending_req = (struct pending_req *)(data->data);
    674	struct xen_blkif_ring *ring = pending_req->ring;
    675	struct xen_blkif *blkif = ring->blkif;
    676
    677	/* BUG_ON used to reproduce existing behaviour,
    678	   but is this the best way to deal with this? */
    679	BUG_ON(result);
    680
    681	gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
    682	make_response(ring, pending_req->id,
    683		      pending_req->operation, pending_req->status);
    684	free_req(ring, pending_req);
    685	/*
    686	 * Make sure the request is freed before releasing blkif,
    687	 * or there could be a race between free_req and the
    688	 * cleanup done in xen_blkif_free during shutdown.
    689	 *
    690	 * NB: The fact that we might try to wake up pending_free_wq
    691	 * before drain_complete (in case there's a drain going on)
     692	 * is not a problem with our current implementation
     693	 * because we can be sure there's no thread waiting on
    694	 * pending_free_wq if there's a drain going on, but it has
    695	 * to be taken into account if the current model is changed.
    696	 */
    697	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
    698		complete(&blkif->drain_complete);
    699	}
    700	xen_blkif_put(blkif);
    701}
    702
    703static void xen_blkbk_unmap_and_respond(struct pending_req *req)
    704{
    705	struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
    706	struct xen_blkif_ring *ring = req->ring;
    707	struct grant_page **pages = req->segments;
    708	unsigned int invcount;
    709
    710	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
    711					   req->unmap, req->unmap_pages);
    712
    713	work->data = req;
    714	work->done = xen_blkbk_unmap_and_respond_callback;
    715	work->unmap_ops = req->unmap;
    716	work->kunmap_ops = NULL;
    717	work->pages = req->unmap_pages;
    718	work->count = invcount;
    719
    720	gnttab_unmap_refs_async(&req->gnttab_unmap_data);
    721}
    722
    723
    724/*
    725 * Unmap the grant references.
    726 *
    727 * This could accumulate ops up to the batch size to reduce the number
    728 * of hypercalls, but since this is only used in error paths there's
    729 * no real need.
    730 */
    731static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
    732                            struct grant_page *pages[],
    733                            int num)
    734{
    735	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    736	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    737	unsigned int invcount = 0;
    738	int ret;
    739
    740	while (num) {
    741		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
    742
    743		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
    744						   unmap, unmap_pages);
    745		if (invcount) {
    746			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
    747			BUG_ON(ret);
    748			gnttab_page_cache_put(&ring->free_pages, unmap_pages,
    749					      invcount);
    750		}
    751		pages += batch;
    752		num -= batch;
    753	}
    754}
    755
    756static int xen_blkbk_map(struct xen_blkif_ring *ring,
    757			 struct grant_page *pages[],
    758			 int num, bool ro)
    759{
    760	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    761	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    762	struct persistent_gnt *persistent_gnt = NULL;
    763	phys_addr_t addr = 0;
    764	int i, seg_idx, new_map_idx;
    765	int segs_to_map = 0;
    766	int ret = 0;
    767	int last_map = 0, map_until = 0;
    768	int use_persistent_gnts;
    769	struct xen_blkif *blkif = ring->blkif;
    770
    771	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
    772
    773	/*
     774	 * Fill out preq.nr_sects with the proper number of sectors, and set up
     775	 * map[..] with the PFN of the page in our domain and the
     776	 * corresponding grant reference for each page.
    777	 */
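       	/*
       	 * Grants are handled in batches of up to
       	 * BLKIF_MAX_SEGMENTS_PER_REQUEST: grants that are already mapped
       	 * persistently are reused directly, the rest are collected into
       	 * map[] and mapped with one gnttab_map_refs() call per batch.
       	 */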
    778again:
    779	for (i = map_until; i < num; i++) {
    780		uint32_t flags;
    781
    782		if (use_persistent_gnts) {
    783			persistent_gnt = get_persistent_gnt(
    784				ring,
    785				pages[i]->gref);
    786		}
    787
    788		if (persistent_gnt) {
    789			/*
    790			 * We are using persistent grants and
    791			 * the grant is already mapped
    792			 */
    793			pages[i]->page = persistent_gnt->page;
    794			pages[i]->persistent_gnt = persistent_gnt;
    795		} else {
    796			if (gnttab_page_cache_get(&ring->free_pages,
    797						  &pages[i]->page)) {
    798				gnttab_page_cache_put(&ring->free_pages,
    799						      pages_to_gnt,
    800						      segs_to_map);
    801				ret = -ENOMEM;
    802				goto out;
    803			}
    804			addr = vaddr(pages[i]->page);
    805			pages_to_gnt[segs_to_map] = pages[i]->page;
    806			pages[i]->persistent_gnt = NULL;
    807			flags = GNTMAP_host_map;
    808			if (!use_persistent_gnts && ro)
    809				flags |= GNTMAP_readonly;
    810			gnttab_set_map_op(&map[segs_to_map++], addr,
    811					  flags, pages[i]->gref,
    812					  blkif->domid);
    813		}
    814		map_until = i + 1;
    815		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
    816			break;
    817	}
    818
    819	if (segs_to_map)
    820		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
    821
    822	/*
    823	 * Now swizzle the MFN in our domain with the MFN from the other domain
    824	 * so that when we access vaddr(pending_req,i) it has the contents of
    825	 * the page from the other domain.
    826	 */
    827	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
    828		if (!pages[seg_idx]->persistent_gnt) {
    829			/* This is a newly mapped grant */
    830			BUG_ON(new_map_idx >= segs_to_map);
    831			if (unlikely(map[new_map_idx].status != 0)) {
    832				pr_debug("invalid buffer -- could not remap it\n");
    833				gnttab_page_cache_put(&ring->free_pages,
    834						      &pages[seg_idx]->page, 1);
    835				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
    836				ret |= !ret;
    837				goto next;
    838			}
    839			pages[seg_idx]->handle = map[new_map_idx].handle;
    840		} else {
    841			continue;
    842		}
    843		if (use_persistent_gnts &&
    844		    ring->persistent_gnt_c < max_pgrants) {
    845			/*
     846			 * We are using persistent grants; the grant is
    847			 * not mapped but we might have room for it.
    848			 */
    849			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
    850				                 GFP_KERNEL);
    851			if (!persistent_gnt) {
    852				/*
    853				 * If we don't have enough memory to
     854				 * allocate the persistent_gnt struct,
     855				 * map this grant non-persistently.
    856				 */
    857				goto next;
    858			}
    859			persistent_gnt->gnt = map[new_map_idx].ref;
    860			persistent_gnt->handle = map[new_map_idx].handle;
    861			persistent_gnt->page = pages[seg_idx]->page;
    862			if (add_persistent_gnt(ring,
    863			                       persistent_gnt)) {
    864				kfree(persistent_gnt);
    865				persistent_gnt = NULL;
    866				goto next;
    867			}
    868			pages[seg_idx]->persistent_gnt = persistent_gnt;
    869			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
    870				 persistent_gnt->gnt, ring->persistent_gnt_c,
    871				 max_pgrants);
    872			goto next;
    873		}
    874		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
    875			blkif->vbd.overflow_max_grants = 1;
    876			pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
    877			         blkif->domid, blkif->vbd.handle);
    878		}
    879		/*
    880		 * We could not map this grant persistently, so use it as
    881		 * a non-persistent grant.
    882		 */
    883next:
    884		new_map_idx++;
    885	}
    886	segs_to_map = 0;
    887	last_map = map_until;
    888	if (!ret && map_until != num)
    889		goto again;
    890
    891out:
    892	for (i = last_map; i < num; i++) {
    893		/* Don't zap current batch's valid persistent grants. */
     894		if (i >= map_until)
    895			pages[i]->persistent_gnt = NULL;
    896		pages[i]->handle = BLKBACK_INVALID_HANDLE;
    897	}
    898
    899	return ret;
    900}
    901
    902static int xen_blkbk_map_seg(struct pending_req *pending_req)
    903{
    904	int rc;
    905
    906	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
    907			   pending_req->nr_segs,
    908	                   (pending_req->operation != BLKIF_OP_READ));
    909
    910	return rc;
    911}
    912
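       /*
        * Indirect requests carry their segment descriptors in separately
        * granted pages (SEGS_PER_INDIRECT_FRAME descriptors per page). Those
        * descriptor pages are mapped, parsed into seg[], and released again
        * before the data segments themselves are mapped.
        */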
    913static int xen_blkbk_parse_indirect(struct blkif_request *req,
    914				    struct pending_req *pending_req,
    915				    struct seg_buf seg[],
    916				    struct phys_req *preq)
    917{
    918	struct grant_page **pages = pending_req->indirect_pages;
    919	struct xen_blkif_ring *ring = pending_req->ring;
    920	int indirect_grefs, rc, n, nseg, i;
    921	struct blkif_request_segment *segments = NULL;
    922
    923	nseg = pending_req->nr_segs;
    924	indirect_grefs = INDIRECT_PAGES(nseg);
    925	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
    926
    927	for (i = 0; i < indirect_grefs; i++)
    928		pages[i]->gref = req->u.indirect.indirect_grefs[i];
    929
    930	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
    931	if (rc)
    932		goto unmap;
    933
    934	for (n = 0; n < nseg; n++) {
    935		uint8_t first_sect, last_sect;
    936
    937		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
    938			/* Map indirect segments */
    939			if (segments)
    940				kunmap_atomic(segments);
    941			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
    942		}
    943		i = n % SEGS_PER_INDIRECT_FRAME;
    944
    945		pending_req->segments[n]->gref = segments[i].gref;
    946
    947		first_sect = READ_ONCE(segments[i].first_sect);
    948		last_sect = READ_ONCE(segments[i].last_sect);
    949		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
    950			rc = -EINVAL;
    951			goto unmap;
    952		}
    953
    954		seg[n].nsec = last_sect - first_sect + 1;
    955		seg[n].offset = first_sect << 9;
    956		preq->nr_sects += seg[n].nsec;
    957	}
    958
    959unmap:
    960	if (segments)
    961		kunmap_atomic(segments);
    962	xen_blkbk_unmap(ring, pages, indirect_grefs);
    963	return rc;
    964}
    965
    966static int dispatch_discard_io(struct xen_blkif_ring *ring,
    967				struct blkif_request *req)
    968{
    969	int err = 0;
    970	int status = BLKIF_RSP_OKAY;
    971	struct xen_blkif *blkif = ring->blkif;
    972	struct block_device *bdev = blkif->vbd.bdev;
    973	struct phys_req preq;
    974
    975	xen_blkif_get(blkif);
    976
    977	preq.sector_number = req->u.discard.sector_number;
    978	preq.nr_sects      = req->u.discard.nr_sectors;
    979
    980	err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
    981	if (err) {
    982		pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
    983			preq.sector_number,
    984			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
    985		goto fail_response;
    986	}
    987	ring->st_ds_req++;
    988
    989	if (blkif->vbd.discard_secure &&
    990	    (req->u.discard.flag & BLKIF_DISCARD_SECURE))
    991		err = blkdev_issue_secure_erase(bdev,
    992				req->u.discard.sector_number,
    993				req->u.discard.nr_sectors, GFP_KERNEL);
    994	else
    995		err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
    996				req->u.discard.nr_sectors, GFP_KERNEL);
    997
    998fail_response:
    999	if (err == -EOPNOTSUPP) {
   1000		pr_debug("discard op failed, not supported\n");
   1001		status = BLKIF_RSP_EOPNOTSUPP;
   1002	} else if (err)
   1003		status = BLKIF_RSP_ERROR;
   1004
   1005	make_response(ring, req->u.discard.id, req->operation, status);
   1006	xen_blkif_put(blkif);
   1007	return err;
   1008}
   1009
   1010static int dispatch_other_io(struct xen_blkif_ring *ring,
   1011			     struct blkif_request *req,
   1012			     struct pending_req *pending_req)
   1013{
   1014	free_req(ring, pending_req);
   1015	make_response(ring, req->u.other.id, req->operation,
   1016		      BLKIF_RSP_EOPNOTSUPP);
   1017	return -EIO;
   1018}
   1019
   1020static void xen_blk_drain_io(struct xen_blkif_ring *ring)
   1021{
   1022	struct xen_blkif *blkif = ring->blkif;
   1023
   1024	atomic_set(&blkif->drain, 1);
   1025	do {
   1026		if (atomic_read(&ring->inflight) == 0)
   1027			break;
   1028		wait_for_completion_interruptible_timeout(
   1029				&blkif->drain_complete, HZ);
   1030
   1031		if (!atomic_read(&blkif->drain))
   1032			break;
   1033	} while (!kthread_should_stop());
   1034	atomic_set(&blkif->drain, 0);
   1035}
   1036
   1037static void __end_block_io_op(struct pending_req *pending_req,
   1038		blk_status_t error)
   1039{
   1040	/* An error fails the entire request. */
   1041	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
   1042	    error == BLK_STS_NOTSUPP) {
   1043		pr_debug("flush diskcache op failed, not supported\n");
   1044		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
   1045		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
   1046	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
   1047		   error == BLK_STS_NOTSUPP) {
   1048		pr_debug("write barrier op failed, not supported\n");
   1049		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
   1050		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
   1051	} else if (error) {
   1052		pr_debug("Buffer not up-to-date at end of operation,"
   1053			 " error=%d\n", error);
   1054		pending_req->status = BLKIF_RSP_ERROR;
   1055	}
   1056
   1057	/*
   1058	 * If all of the bio's have completed it is time to unmap
   1059	 * the grant references associated with 'request' and provide
   1060	 * the proper response on the ring.
   1061	 */
   1062	if (atomic_dec_and_test(&pending_req->pendcnt))
   1063		xen_blkbk_unmap_and_respond(pending_req);
   1064}
   1065
   1066/*
   1067 * bio callback.
   1068 */
   1069static void end_block_io_op(struct bio *bio)
   1070{
   1071	__end_block_io_op(bio->bi_private, bio->bi_status);
   1072	bio_put(bio);
   1073}
   1074
   1075
   1076
   1077/*
    1078 * Function to copy the 'struct blkif_request' from the ring buffer
    1079 * (which has the sectors we want, the number of them, grant references, etc.)
    1080 * and transmute it to the block API to hand it over to the proper block disk.
   1081 */
   1082static int
   1083__do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
   1084{
   1085	union blkif_back_rings *blk_rings = &ring->blk_rings;
   1086	struct blkif_request req;
   1087	struct pending_req *pending_req;
   1088	RING_IDX rc, rp;
   1089	int more_to_do = 0;
   1090
   1091	rc = blk_rings->common.req_cons;
   1092	rp = blk_rings->common.sring->req_prod;
   1093	rmb(); /* Ensure we see queued requests up to 'rp'. */
   1094
   1095	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
   1096		rc = blk_rings->common.rsp_prod_pvt;
   1097		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
   1098			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
   1099		return -EACCES;
   1100	}
   1101	while (rc != rp) {
   1102
   1103		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
   1104			break;
   1105
   1106		/* We've seen a request, so clear spurious eoi flag. */
   1107		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
   1108
   1109		if (kthread_should_stop()) {
   1110			more_to_do = 1;
   1111			break;
   1112		}
   1113
   1114		pending_req = alloc_req(ring);
   1115		if (NULL == pending_req) {
   1116			ring->st_oo_req++;
   1117			more_to_do = 1;
   1118			break;
   1119		}
   1120
   1121		switch (ring->blkif->blk_protocol) {
   1122		case BLKIF_PROTOCOL_NATIVE:
   1123			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
   1124			break;
   1125		case BLKIF_PROTOCOL_X86_32:
   1126			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
   1127			break;
   1128		case BLKIF_PROTOCOL_X86_64:
   1129			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
   1130			break;
   1131		default:
   1132			BUG();
   1133		}
   1134		blk_rings->common.req_cons = ++rc; /* before make_response() */
   1135
   1136		/* Apply all sanity checks to /private copy/ of request. */
   1137		barrier();
   1138
   1139		switch (req.operation) {
   1140		case BLKIF_OP_READ:
   1141		case BLKIF_OP_WRITE:
   1142		case BLKIF_OP_WRITE_BARRIER:
   1143		case BLKIF_OP_FLUSH_DISKCACHE:
   1144		case BLKIF_OP_INDIRECT:
   1145			if (dispatch_rw_block_io(ring, &req, pending_req))
   1146				goto done;
   1147			break;
   1148		case BLKIF_OP_DISCARD:
   1149			free_req(ring, pending_req);
   1150			if (dispatch_discard_io(ring, &req))
   1151				goto done;
   1152			break;
   1153		default:
   1154			if (dispatch_other_io(ring, &req, pending_req))
   1155				goto done;
   1156			break;
   1157		}
   1158
   1159		/* Yield point for this unbounded loop. */
   1160		cond_resched();
   1161	}
   1162done:
   1163	return more_to_do;
   1164}
   1165
   1166static int
   1167do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
   1168{
   1169	union blkif_back_rings *blk_rings = &ring->blk_rings;
   1170	int more_to_do;
   1171
   1172	do {
   1173		more_to_do = __do_block_io_op(ring, eoi_flags);
   1174		if (more_to_do)
   1175			break;
   1176
   1177		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
   1178	} while (more_to_do);
   1179
   1180	return more_to_do;
   1181}
   1182/*
    1183 * Transmute the 'struct blkif_request' into a proper 'struct bio'
    1184 * and call 'submit_bio' to pass it to the underlying storage.
   1185 */
   1186static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
   1187				struct blkif_request *req,
   1188				struct pending_req *pending_req)
   1189{
   1190	struct phys_req preq;
   1191	struct seg_buf *seg = pending_req->seg;
   1192	unsigned int nseg;
   1193	struct bio *bio = NULL;
   1194	struct bio **biolist = pending_req->biolist;
   1195	int i, nbio = 0;
   1196	int operation;
   1197	int operation_flags = 0;
   1198	struct blk_plug plug;
   1199	bool drain = false;
   1200	struct grant_page **pages = pending_req->segments;
   1201	unsigned short req_operation;
   1202
   1203	req_operation = req->operation == BLKIF_OP_INDIRECT ?
   1204			req->u.indirect.indirect_op : req->operation;
   1205
   1206	if ((req->operation == BLKIF_OP_INDIRECT) &&
   1207	    (req_operation != BLKIF_OP_READ) &&
   1208	    (req_operation != BLKIF_OP_WRITE)) {
   1209		pr_debug("Invalid indirect operation (%u)\n", req_operation);
   1210		goto fail_response;
   1211	}
   1212
   1213	switch (req_operation) {
   1214	case BLKIF_OP_READ:
   1215		ring->st_rd_req++;
   1216		operation = REQ_OP_READ;
   1217		break;
   1218	case BLKIF_OP_WRITE:
   1219		ring->st_wr_req++;
   1220		operation = REQ_OP_WRITE;
   1221		operation_flags = REQ_SYNC | REQ_IDLE;
   1222		break;
   1223	case BLKIF_OP_WRITE_BARRIER:
   1224		drain = true;
   1225		fallthrough;
   1226	case BLKIF_OP_FLUSH_DISKCACHE:
   1227		ring->st_f_req++;
   1228		operation = REQ_OP_WRITE;
   1229		operation_flags = REQ_PREFLUSH;
   1230		break;
   1231	default:
   1232		operation = 0; /* make gcc happy */
   1233		goto fail_response;
   1234		break;
   1235	}
   1236
   1237	/* Check that the number of segments is sane. */
   1238	nseg = req->operation == BLKIF_OP_INDIRECT ?
   1239	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
   1240
   1241	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
   1242	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
   1243		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
   1244	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
   1245		     (nseg > MAX_INDIRECT_SEGMENTS))) {
   1246		pr_debug("Bad number of segments in request (%d)\n", nseg);
   1247		/* Haven't submitted any bio's yet. */
   1248		goto fail_response;
   1249	}
   1250
   1251	preq.nr_sects      = 0;
   1252
   1253	pending_req->ring      = ring;
   1254	pending_req->id        = req->u.rw.id;
   1255	pending_req->operation = req_operation;
   1256	pending_req->status    = BLKIF_RSP_OKAY;
   1257	pending_req->nr_segs   = nseg;
   1258
   1259	if (req->operation != BLKIF_OP_INDIRECT) {
   1260		preq.dev               = req->u.rw.handle;
   1261		preq.sector_number     = req->u.rw.sector_number;
   1262		for (i = 0; i < nseg; i++) {
   1263			pages[i]->gref = req->u.rw.seg[i].gref;
   1264			seg[i].nsec = req->u.rw.seg[i].last_sect -
   1265				req->u.rw.seg[i].first_sect + 1;
   1266			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
   1267			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
   1268			    (req->u.rw.seg[i].last_sect <
   1269			     req->u.rw.seg[i].first_sect))
   1270				goto fail_response;
   1271			preq.nr_sects += seg[i].nsec;
   1272		}
   1273	} else {
   1274		preq.dev               = req->u.indirect.handle;
   1275		preq.sector_number     = req->u.indirect.sector_number;
   1276		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
   1277			goto fail_response;
   1278	}
   1279
   1280	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
   1281		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
   1282			 operation == REQ_OP_READ ? "read" : "write",
   1283			 preq.sector_number,
   1284			 preq.sector_number + preq.nr_sects,
   1285			 ring->blkif->vbd.pdevice);
   1286		goto fail_response;
   1287	}
   1288
   1289	/*
   1290	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
   1291	 * is set there.
   1292	 */
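       	/*
       	 * Both the starting sector and every segment length must be a
       	 * multiple of the device's logical block size (expressed in
       	 * 512-byte sectors), otherwise the request is rejected.
       	 */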
   1293	for (i = 0; i < nseg; i++) {
   1294		if (((int)preq.sector_number|(int)seg[i].nsec) &
   1295		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
   1296			pr_debug("Misaligned I/O request from domain %d\n",
   1297				 ring->blkif->domid);
   1298			goto fail_response;
   1299		}
   1300	}
   1301
    1302	/* Wait on all outstanding I/Os and, once they have completed,
   1303	 * issue the flush.
   1304	 */
   1305	if (drain)
   1306		xen_blk_drain_io(pending_req->ring);
   1307
   1308	/*
   1309	 * If we have failed at this point, we need to undo the M2P override,
   1310	 * set gnttab_set_unmap_op on all of the grant references and perform
   1311	 * the hypercall to unmap the grants - that is all done in
   1312	 * xen_blkbk_unmap.
   1313	 */
   1314	if (xen_blkbk_map_seg(pending_req))
   1315		goto fail_flush;
   1316
   1317	/*
    1318	 * The corresponding xen_blkif_put is done in __end_block_io_op, or
   1319	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
   1320	 */
   1321	xen_blkif_get(ring->blkif);
   1322	atomic_inc(&ring->inflight);
   1323
   1324	for (i = 0; i < nseg; i++) {
   1325		while ((bio == NULL) ||
   1326		       (bio_add_page(bio,
   1327				     pages[i]->page,
   1328				     seg[i].nsec << 9,
   1329				     seg[i].offset) == 0)) {
   1330			bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
   1331					operation | operation_flags,
   1332					GFP_KERNEL);
   1333			biolist[nbio++] = bio;
   1334			bio->bi_private = pending_req;
   1335			bio->bi_end_io  = end_block_io_op;
   1336			bio->bi_iter.bi_sector  = preq.sector_number;
   1337		}
   1338
   1339		preq.sector_number += seg[i].nsec;
   1340	}
   1341
   1342	/* This will be hit if the operation was a flush or discard. */
   1343	if (!bio) {
   1344		BUG_ON(operation_flags != REQ_PREFLUSH);
   1345
   1346		bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
   1347				GFP_KERNEL);
   1348		biolist[nbio++] = bio;
   1349		bio->bi_private = pending_req;
   1350		bio->bi_end_io  = end_block_io_op;
   1351	}
   1352
   1353	atomic_set(&pending_req->pendcnt, nbio);
   1354	blk_start_plug(&plug);
   1355
   1356	for (i = 0; i < nbio; i++)
   1357		submit_bio(biolist[i]);
   1358
   1359	/* Let the I/Os go.. */
   1360	blk_finish_plug(&plug);
   1361
   1362	if (operation == REQ_OP_READ)
   1363		ring->st_rd_sect += preq.nr_sects;
   1364	else if (operation == REQ_OP_WRITE)
   1365		ring->st_wr_sect += preq.nr_sects;
   1366
   1367	return 0;
   1368
   1369 fail_flush:
   1370	xen_blkbk_unmap(ring, pending_req->segments,
   1371	                pending_req->nr_segs);
   1372 fail_response:
   1373	/* Haven't submitted any bio's yet. */
   1374	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
   1375	free_req(ring, pending_req);
   1376	msleep(1); /* back off a bit */
   1377	return -EIO;
   1378}
   1379
   1380
   1381
   1382/*
   1383 * Put a response on the ring on how the operation fared.
   1384 */
   1385static void make_response(struct xen_blkif_ring *ring, u64 id,
   1386			  unsigned short op, int st)
   1387{
   1388	struct blkif_response *resp;
   1389	unsigned long     flags;
   1390	union blkif_back_rings *blk_rings;
   1391	int notify;
   1392
   1393	spin_lock_irqsave(&ring->blk_ring_lock, flags);
   1394	blk_rings = &ring->blk_rings;
   1395	/* Place on the response ring for the relevant domain. */
   1396	switch (ring->blkif->blk_protocol) {
   1397	case BLKIF_PROTOCOL_NATIVE:
   1398		resp = RING_GET_RESPONSE(&blk_rings->native,
   1399					 blk_rings->native.rsp_prod_pvt);
   1400		break;
   1401	case BLKIF_PROTOCOL_X86_32:
   1402		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
   1403					 blk_rings->x86_32.rsp_prod_pvt);
   1404		break;
   1405	case BLKIF_PROTOCOL_X86_64:
   1406		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
   1407					 blk_rings->x86_64.rsp_prod_pvt);
   1408		break;
   1409	default:
   1410		BUG();
   1411	}
   1412
   1413	resp->id        = id;
   1414	resp->operation = op;
   1415	resp->status    = st;
   1416
   1417	blk_rings->common.rsp_prod_pvt++;
   1418	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
   1419	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
   1420	if (notify)
   1421		notify_remote_via_irq(ring->irq);
   1422}
   1423
   1424static int __init xen_blkif_init(void)
   1425{
   1426	int rc = 0;
   1427
   1428	if (!xen_domain())
   1429		return -ENODEV;
   1430
   1431	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
   1432		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
   1433			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
   1434		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
   1435	}
   1436
   1437	if (xenblk_max_queues == 0)
   1438		xenblk_max_queues = num_online_cpus();
   1439
   1440	rc = xen_blkif_interface_init();
   1441	if (rc)
   1442		goto failed_init;
   1443
   1444	rc = xen_blkif_xenbus_init();
   1445	if (rc)
   1446		goto failed_init;
   1447
   1448 failed_init:
   1449	return rc;
   1450}
   1451
   1452module_init(xen_blkif_init);
   1453
   1454static void __exit xen_blkif_fini(void)
   1455{
   1456	xen_blkif_xenbus_fini();
   1457	xen_blkif_interface_fini();
   1458}
   1459
   1460module_exit(xen_blkif_fini);
   1461
   1462MODULE_LICENSE("Dual BSD/GPL");
   1463MODULE_ALIAS("xen-backend:vbd");