cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

virtio_mem.c (81114B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Virtio-mem device driver.
      4 *
      5 * Copyright Red Hat, Inc. 2020
      6 *
      7 * Author(s): David Hildenbrand <david@redhat.com>
      8 */
      9
     10#include <linux/virtio.h>
     11#include <linux/virtio_mem.h>
     12#include <linux/workqueue.h>
     13#include <linux/slab.h>
     14#include <linux/module.h>
     15#include <linux/mm.h>
     16#include <linux/memory_hotplug.h>
     17#include <linux/memory.h>
     18#include <linux/hrtimer.h>
     19#include <linux/crash_dump.h>
     20#include <linux/mutex.h>
     21#include <linux/bitmap.h>
     22#include <linux/lockdep.h>
     23#include <linux/log2.h>
     24
     25#include <acpi/acpi_numa.h>
     26
     27static bool unplug_online = true;
     28module_param(unplug_online, bool, 0644);
     29MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
     30
     31static bool force_bbm;
     32module_param(force_bbm, bool, 0444);
     33MODULE_PARM_DESC(force_bbm,
      34		"Force Big Block Mode. Default is 0 (auto-selection).");
     35
     36static unsigned long bbm_block_size;
     37module_param(bbm_block_size, ulong, 0444);
     38MODULE_PARM_DESC(bbm_block_size,
     39		 "Big Block size in bytes. Default is 0 (auto-detection).");
     40
     41static bool bbm_safe_unplug = true;
     42module_param(bbm_safe_unplug, bool, 0444);
     43MODULE_PARM_DESC(bbm_safe_unplug,
     44	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
     45
     46/*
     47 * virtio-mem currently supports the following modes of operation:
     48 *
     49 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
     50 *   size of a Sub Block (SB) is determined based on the device block size, the
     51 *   pageblock size, and the maximum allocation granularity of the buddy.
     52 *   Subblocks within a Linux memory block might either be plugged or unplugged.
     53 *   Memory is added/removed to Linux MM in Linux memory block granularity.
     54 *
     55 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
     56 *   Memory is added/removed to Linux MM in Big Block granularity.
     57 *
     58 * The mode is determined automatically based on the Linux memory block size
     59 * and the device block size.
     60 *
     61 * User space / core MM (auto onlining) is responsible for onlining added
      62 * Linux memory blocks - and for selecting a zone. Linux memory blocks are
     63 * always onlined separately, and all memory within a Linux memory block is
     64 * onlined to the same zone - virtio-mem relies on this behavior.
     65 */
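
/*
 * Illustrative sketch only - a simplified view of the mode selection
 * described above, not the driver's actual probe logic (which is not
 * shown in this excerpt). The helper name is hypothetical. Roughly:
 * SBM is usable when subblocks can subdivide a single Linux memory
 * block; otherwise, or when the force_bbm module parameter is set,
 * the driver falls back to BBM.
 */
static bool example_would_use_sbm(uint64_t sb_size)
{
	return !force_bbm && sb_size < memory_block_size_bytes();
}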
     66
     67/*
     68 * State of a Linux memory block in SBM.
     69 */
     70enum virtio_mem_sbm_mb_state {
     71	/* Unplugged, not added to Linux. Can be reused later. */
     72	VIRTIO_MEM_SBM_MB_UNUSED = 0,
     73	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
     74	VIRTIO_MEM_SBM_MB_PLUGGED,
     75	/* Fully plugged, fully added to Linux, offline. */
     76	VIRTIO_MEM_SBM_MB_OFFLINE,
     77	/* Partially plugged, fully added to Linux, offline. */
     78	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
     79	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
     80	VIRTIO_MEM_SBM_MB_KERNEL,
      81	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
     82	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
     83	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
     84	VIRTIO_MEM_SBM_MB_MOVABLE,
     85	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
     86	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
     87	VIRTIO_MEM_SBM_MB_COUNT
     88};
     89
     90/*
     91 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
     92 */
     93enum virtio_mem_bbm_bb_state {
     94	/* Unplugged, not added to Linux. Can be reused later. */
     95	VIRTIO_MEM_BBM_BB_UNUSED = 0,
     96	/* Plugged, not added to Linux. Error on add_memory(). */
     97	VIRTIO_MEM_BBM_BB_PLUGGED,
     98	/* Plugged and added to Linux. */
     99	VIRTIO_MEM_BBM_BB_ADDED,
    100	/* All online parts are fake-offline, ready to remove. */
    101	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
    102	VIRTIO_MEM_BBM_BB_COUNT
    103};
    104
    105struct virtio_mem {
    106	struct virtio_device *vdev;
    107
    108	/* We might first have to unplug all memory when starting up. */
    109	bool unplug_all_required;
    110
    111	/* Workqueue that processes the plug/unplug requests. */
    112	struct work_struct wq;
    113	atomic_t wq_active;
    114	atomic_t config_changed;
    115
    116	/* Virtqueue for guest->host requests. */
    117	struct virtqueue *vq;
    118
    119	/* Wait for a host response to a guest request. */
    120	wait_queue_head_t host_resp;
    121
    122	/* Space for one guest request and the host response. */
    123	struct virtio_mem_req req;
    124	struct virtio_mem_resp resp;
    125
    126	/* The current size of the device. */
    127	uint64_t plugged_size;
    128	/* The requested size of the device. */
    129	uint64_t requested_size;
    130
    131	/* The device block size (for communicating with the device). */
    132	uint64_t device_block_size;
    133	/* The determined node id for all memory of the device. */
    134	int nid;
    135	/* Physical start address of the memory region. */
    136	uint64_t addr;
    137	/* Maximum region size in bytes. */
    138	uint64_t region_size;
    139
    140	/* The parent resource for all memory added via this device. */
    141	struct resource *parent_resource;
    142	/*
    143	 * Copy of "System RAM (virtio_mem)" to be used for
    144	 * add_memory_driver_managed().
    145	 */
    146	const char *resource_name;
    147	/* Memory group identification. */
    148	int mgid;
    149
    150	/*
    151	 * We don't want to add too much memory if it's not getting onlined,
     152	 * to avoid running OOM. In addition to this threshold, we allow having
     153	 * at least two offline blocks at a time (whichever is bigger).
    154	 */
    155#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
    156	atomic64_t offline_size;
    157	uint64_t offline_threshold;
    158
    159	/* If set, the driver is in SBM, otherwise in BBM. */
    160	bool in_sbm;
    161
    162	union {
    163		struct {
    164			/* Id of the first memory block of this device. */
    165			unsigned long first_mb_id;
    166			/* Id of the last usable memory block of this device. */
    167			unsigned long last_usable_mb_id;
     168			/* Id of the next memory block to prepare when needed. */
    169			unsigned long next_mb_id;
    170
    171			/* The subblock size. */
    172			uint64_t sb_size;
    173			/* The number of subblocks per Linux memory block. */
    174			uint32_t sbs_per_mb;
    175
    176			/* Summary of all memory block states. */
    177			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
    178
    179			/*
    180			 * One byte state per memory block. Allocated via
    181			 * vmalloc(). Resized (alloc+copy+free) on demand.
    182			 *
    183			 * With 128 MiB memory blocks, we have states for 512
    184			 * GiB of memory in one 4 KiB page.
    185			 */
    186			uint8_t *mb_states;
    187
    188			/*
    189			 * Bitmap: one bit per subblock. Allocated similar to
    190			 * sbm.mb_states.
    191			 *
    192			 * A set bit means the corresponding subblock is
     193			 * plugged, otherwise it's unplugged.
    194			 *
    195			 * With 4 MiB subblocks, we manage 128 GiB of memory
    196			 * in one 4 KiB page.
    197			 */
    198			unsigned long *sb_states;
    199		} sbm;
    200
    201		struct {
    202			/* Id of the first big block of this device. */
    203			unsigned long first_bb_id;
    204			/* Id of the last usable big block of this device. */
    205			unsigned long last_usable_bb_id;
     206			/* Id of the next big block to prepare when needed. */
    207			unsigned long next_bb_id;
    208
    209			/* Summary of all big block states. */
    210			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
    211
    212			/* One byte state per big block. See sbm.mb_states. */
    213			uint8_t *bb_states;
    214
    215			/* The block size used for plugging/adding/removing. */
    216			uint64_t bb_size;
    217		} bbm;
    218	};
    219
    220	/*
    221	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
    222	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
    223	 *
    224	 * When this lock is held the pointers can't change, ONLINE and
    225	 * OFFLINE blocks can't change the state and no subblocks will get
    226	 * plugged/unplugged.
    227	 *
    228	 * In kdump mode, used to serialize requests, last_block_addr and
    229	 * last_block_plugged.
    230	 */
    231	struct mutex hotplug_mutex;
    232	bool hotplug_active;
    233
    234	/* An error occurred we cannot handle - stop processing requests. */
    235	bool broken;
    236
     237	/* Cached value of is_kdump_kernel() when the device was probed. */
    238	bool in_kdump;
    239
    240	/* The driver is being removed. */
    241	spinlock_t removal_lock;
    242	bool removing;
    243
    244	/* Timer for retrying to plug/unplug memory. */
    245	struct hrtimer retry_timer;
    246	unsigned int retry_timer_ms;
    247#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
    248#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
    249
    250	/* Memory notifier (online/offline events). */
    251	struct notifier_block memory_notifier;
    252
    253#ifdef CONFIG_PROC_VMCORE
    254	/* vmcore callback for /proc/vmcore handling in kdump mode */
    255	struct vmcore_cb vmcore_cb;
    256	uint64_t last_block_addr;
    257	bool last_block_plugged;
    258#endif /* CONFIG_PROC_VMCORE */
    259
    260	/* Next device in the list of virtio-mem devices. */
    261	struct list_head next;
    262};
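
/*
 * Back-of-the-envelope check of the sizing comments in struct virtio_mem
 * (illustrative helpers, not part of the driver; the names are made up).
 * One 4 KiB page of mb_states holds 4096 one-byte states; with 128 MiB
 * memory blocks that covers 4096 * 128 MiB = 512 GiB. One 4 KiB page of
 * sb_states holds 4096 * 8 = 32768 bits; with 4 MiB subblocks that
 * covers 32768 * 4 MiB = 128 GiB.
 */
static uint64_t example_mb_state_coverage_per_page(uint64_t mb_size)
{
	/* one byte of state per memory block */
	return PAGE_SIZE * mb_size;
}

static uint64_t example_sb_state_coverage_per_page(uint64_t sb_size)
{
	/* one bit per subblock */
	return PAGE_SIZE * BITS_PER_BYTE * sb_size;
}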
    263
    264/*
    265 * We have to share a single online_page callback among all virtio-mem
    266 * devices. We use RCU to iterate the list in the callback.
    267 */
    268static DEFINE_MUTEX(virtio_mem_mutex);
    269static LIST_HEAD(virtio_mem_devices);
    270
    271static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
    272static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
    273						  unsigned long nr_pages);
    274static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
    275						   unsigned long nr_pages);
    276static void virtio_mem_retry(struct virtio_mem *vm);
    277static int virtio_mem_create_resource(struct virtio_mem *vm);
    278static void virtio_mem_delete_resource(struct virtio_mem *vm);
    279
    280/*
    281 * Register a virtio-mem device so it will be considered for the online_page
    282 * callback.
    283 */
    284static int register_virtio_mem_device(struct virtio_mem *vm)
    285{
    286	int rc = 0;
    287
    288	/* First device registers the callback. */
    289	mutex_lock(&virtio_mem_mutex);
    290	if (list_empty(&virtio_mem_devices))
    291		rc = set_online_page_callback(&virtio_mem_online_page_cb);
    292	if (!rc)
    293		list_add_rcu(&vm->next, &virtio_mem_devices);
    294	mutex_unlock(&virtio_mem_mutex);
    295
    296	return rc;
    297}
    298
    299/*
    300 * Unregister a virtio-mem device so it will no longer be considered for the
    301 * online_page callback.
    302 */
    303static void unregister_virtio_mem_device(struct virtio_mem *vm)
    304{
    305	/* Last device unregisters the callback. */
    306	mutex_lock(&virtio_mem_mutex);
    307	list_del_rcu(&vm->next);
    308	if (list_empty(&virtio_mem_devices))
    309		restore_online_page_callback(&virtio_mem_online_page_cb);
    310	mutex_unlock(&virtio_mem_mutex);
    311
    312	synchronize_rcu();
    313}
    314
    315/*
    316 * Calculate the memory block id of a given address.
    317 */
    318static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
    319{
    320	return addr / memory_block_size_bytes();
    321}
    322
    323/*
    324 * Calculate the physical start address of a given memory block id.
    325 */
    326static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
    327{
    328	return mb_id * memory_block_size_bytes();
    329}
    330
    331/*
    332 * Calculate the big block id of a given address.
    333 */
    334static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
    335					      uint64_t addr)
    336{
    337	return addr / vm->bbm.bb_size;
    338}
    339
    340/*
    341 * Calculate the physical start address of a given big block id.
    342 */
    343static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
    344					 unsigned long bb_id)
    345{
    346	return bb_id * vm->bbm.bb_size;
    347}
    348
    349/*
    350 * Calculate the subblock id of a given address.
    351 */
    352static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
    353					      unsigned long addr)
    354{
    355	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
    356	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
    357
    358	return (addr - mb_addr) / vm->sbm.sb_size;
    359}
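
/*
 * Worked example for the conversions above, assuming 128 MiB Linux memory
 * blocks and 4 MiB subblocks (one possible configuration, not the only
 * one): for addr = 0x8c00000 (140 MiB), mb_id = 140 MiB / 128 MiB = 1,
 * mb_addr = 128 MiB, and sb_id = (140 MiB - 128 MiB) / 4 MiB = 3.
 */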
    360
    361/*
    362 * Set the state of a big block, taking care of the state counter.
    363 */
    364static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
    365					unsigned long bb_id,
    366					enum virtio_mem_bbm_bb_state state)
    367{
    368	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
    369	enum virtio_mem_bbm_bb_state old_state;
    370
    371	old_state = vm->bbm.bb_states[idx];
    372	vm->bbm.bb_states[idx] = state;
    373
    374	BUG_ON(vm->bbm.bb_count[old_state] == 0);
    375	vm->bbm.bb_count[old_state]--;
    376	vm->bbm.bb_count[state]++;
    377}
    378
    379/*
    380 * Get the state of a big block.
    381 */
    382static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
    383								unsigned long bb_id)
    384{
    385	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
    386}
    387
    388/*
    389 * Prepare the big block state array for the next big block.
    390 */
    391static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
    392{
    393	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
    394	unsigned long new_bytes = old_bytes + 1;
    395	int old_pages = PFN_UP(old_bytes);
    396	int new_pages = PFN_UP(new_bytes);
    397	uint8_t *new_array;
    398
    399	if (vm->bbm.bb_states && old_pages == new_pages)
    400		return 0;
    401
    402	new_array = vzalloc(new_pages * PAGE_SIZE);
    403	if (!new_array)
    404		return -ENOMEM;
    405
    406	mutex_lock(&vm->hotplug_mutex);
    407	if (vm->bbm.bb_states)
    408		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
    409	vfree(vm->bbm.bb_states);
    410	vm->bbm.bb_states = new_array;
    411	mutex_unlock(&vm->hotplug_mutex);
    412
    413	return 0;
    414}
    415
    416#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
     417	for (_bb_id = _vm->bbm.first_bb_id; \
     418	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
    419	     _bb_id++) \
    420		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
    421
    422#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
     423	for (_bb_id = _vm->bbm.next_bb_id - 1; \
     424	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
    425	     _bb_id--) \
    426		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
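
/*
 * Hypothetical usage sketch for the iterators above (this function is not
 * part of the driver): walk all big blocks in a given state. The
 * _vm->bbm.bb_count[_state] term in the loop condition lets a walk stop
 * early once no blocks in that state remain, which matters when the loop
 * body transitions blocks to other states. Callers that need stable
 * states hold vm->hotplug_mutex around the walk.
 */
static unsigned long example_count_bbs(struct virtio_mem *vm,
				       enum virtio_mem_bbm_bb_state state)
{
	unsigned long bb_id, count = 0;

	virtio_mem_bbm_for_each_bb(vm, bb_id, state)
		count++;
	return count;
}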
    427
    428/*
    429 * Set the state of a memory block, taking care of the state counter.
    430 */
    431static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
    432					unsigned long mb_id, uint8_t state)
    433{
    434	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
    435	uint8_t old_state;
    436
    437	old_state = vm->sbm.mb_states[idx];
    438	vm->sbm.mb_states[idx] = state;
    439
    440	BUG_ON(vm->sbm.mb_count[old_state] == 0);
    441	vm->sbm.mb_count[old_state]--;
    442	vm->sbm.mb_count[state]++;
    443}
    444
    445/*
    446 * Get the state of a memory block.
    447 */
    448static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
    449					   unsigned long mb_id)
    450{
    451	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
    452
    453	return vm->sbm.mb_states[idx];
    454}
    455
    456/*
    457 * Prepare the state array for the next memory block.
    458 */
    459static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
    460{
    461	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
    462	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
    463	uint8_t *new_array;
    464
    465	if (vm->sbm.mb_states && old_pages == new_pages)
    466		return 0;
    467
    468	new_array = vzalloc(new_pages * PAGE_SIZE);
    469	if (!new_array)
    470		return -ENOMEM;
    471
    472	mutex_lock(&vm->hotplug_mutex);
    473	if (vm->sbm.mb_states)
    474		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
    475	vfree(vm->sbm.mb_states);
    476	vm->sbm.mb_states = new_array;
    477	mutex_unlock(&vm->hotplug_mutex);
    478
    479	return 0;
    480}
    481
    482#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
    483	for (_mb_id = _vm->sbm.first_mb_id; \
    484	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
    485	     _mb_id++) \
    486		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
    487
    488#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
    489	for (_mb_id = _vm->sbm.next_mb_id - 1; \
    490	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
    491	     _mb_id--) \
    492		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
    493
    494/*
    495 * Calculate the bit number in the subblock bitmap for the given subblock
    496 * inside the given memory block.
    497 */
    498static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
    499					  unsigned long mb_id, int sb_id)
    500{
    501	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
    502}
    503
    504/*
    505 * Mark all selected subblocks plugged.
    506 *
    507 * Will not modify the state of the memory block.
    508 */
    509static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
    510					  unsigned long mb_id, int sb_id,
    511					  int count)
    512{
    513	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
    514
    515	__bitmap_set(vm->sbm.sb_states, bit, count);
    516}
    517
    518/*
    519 * Mark all selected subblocks unplugged.
    520 *
    521 * Will not modify the state of the memory block.
    522 */
    523static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
    524					    unsigned long mb_id, int sb_id,
    525					    int count)
    526{
    527	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
    528
    529	__bitmap_clear(vm->sbm.sb_states, bit, count);
    530}
    531
    532/*
    533 * Test if all selected subblocks are plugged.
    534 */
    535static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
    536					   unsigned long mb_id, int sb_id,
    537					   int count)
    538{
    539	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
    540
    541	if (count == 1)
    542		return test_bit(bit, vm->sbm.sb_states);
    543
    544	/* TODO: Helper similar to bitmap_set() */
    545	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
    546	       bit + count;
    547}
    548
    549/*
    550 * Test if all selected subblocks are unplugged.
    551 */
    552static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
    553					     unsigned long mb_id, int sb_id,
    554					     int count)
    555{
    556	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
    557
    558	/* TODO: Helper similar to bitmap_set() */
    559	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
    560	       bit + count;
    561}
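
/*
 * Note on the idiom above: find_next_bit(addr, size, offset) scans bits
 * [offset, size) and returns a value >= size when no set bit is found.
 * A result >= bit + count therefore means no bit in the window is set,
 * i.e. all selected subblocks are unplugged;
 * virtio_mem_sbm_test_sb_plugged() uses find_next_zero_bit() the same
 * way for the opposite test.
 */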
    562
    563/*
    564 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
    565 * none.
    566 */
    567static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
    568					    unsigned long mb_id)
    569{
    570	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
    571
    572	return find_next_zero_bit(vm->sbm.sb_states,
    573				  bit + vm->sbm.sbs_per_mb, bit) - bit;
    574}
    575
    576/*
    577 * Prepare the subblock bitmap for the next memory block.
    578 */
    579static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
    580{
    581	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
    582	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
    583	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
    584	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
    585	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
    586	unsigned long *new_bitmap, *old_bitmap;
    587
    588	if (vm->sbm.sb_states && old_pages == new_pages)
    589		return 0;
    590
    591	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
    592	if (!new_bitmap)
    593		return -ENOMEM;
    594
    595	mutex_lock(&vm->hotplug_mutex);
    596	if (vm->sbm.sb_states)
    597		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
    598
    599	old_bitmap = vm->sbm.sb_states;
    600	vm->sbm.sb_states = new_bitmap;
    601	mutex_unlock(&vm->hotplug_mutex);
    602
    603	vfree(old_bitmap);
    604	return 0;
    605}
    606
    607/*
    608 * Test if we could add memory without creating too much offline memory -
     609 * to avoid running OOM if memory onlining is deferred.
    610 */
    611static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
    612{
    613	if (WARN_ON_ONCE(size > vm->offline_threshold))
    614		return false;
    615
    616	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
    617}
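
/*
 * Worked example, assuming the default 1 GiB offline_threshold: with
 * offline_size currently at 768 MiB, adding 256 MiB still passes
 * (768 MiB + 256 MiB <= 1 GiB), while adding 512 MiB does not
 * (768 MiB + 512 MiB > 1 GiB).
 */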
    618
    619/*
    620 * Try adding memory to Linux. Will usually only fail if out of memory.
    621 *
    622 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
    623 * onlining code).
    624 *
    625 * Will not modify the state of memory blocks in virtio-mem.
    626 */
    627static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
    628				 uint64_t size)
    629{
    630	int rc;
    631
    632	/*
    633	 * When force-unloading the driver and we still have memory added to
    634	 * Linux, the resource name has to stay.
    635	 */
    636	if (!vm->resource_name) {
    637		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
    638						  GFP_KERNEL);
    639		if (!vm->resource_name)
    640			return -ENOMEM;
    641	}
    642
    643	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
    644		addr + size - 1);
    645	/* Memory might get onlined immediately. */
    646	atomic64_add(size, &vm->offline_size);
    647	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
    648				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
    649	if (rc) {
    650		atomic64_sub(size, &vm->offline_size);
    651		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
    652		/*
    653		 * TODO: Linux MM does not properly clean up yet in all cases
    654		 * where adding of memory failed - especially on -ENOMEM.
    655		 */
    656	}
    657	return rc;
    658}
    659
    660/*
    661 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
    662 */
    663static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
    664{
    665	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
    666	const uint64_t size = memory_block_size_bytes();
    667
    668	return virtio_mem_add_memory(vm, addr, size);
    669}
    670
    671/*
    672 * See virtio_mem_add_memory(): Try adding a big block.
    673 */
    674static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
    675{
    676	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
    677	const uint64_t size = vm->bbm.bb_size;
    678
    679	return virtio_mem_add_memory(vm, addr, size);
    680}
    681
    682/*
    683 * Try removing memory from Linux. Will only fail if memory blocks aren't
    684 * offline.
    685 *
    686 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
    687 * onlining code).
    688 *
    689 * Will not modify the state of memory blocks in virtio-mem.
    690 */
    691static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
    692				    uint64_t size)
    693{
    694	int rc;
    695
    696	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
    697		addr + size - 1);
    698	rc = remove_memory(addr, size);
    699	if (!rc) {
    700		atomic64_sub(size, &vm->offline_size);
    701		/*
    702		 * We might have freed up memory we can now unplug, retry
    703		 * immediately instead of waiting.
    704		 */
    705		virtio_mem_retry(vm);
    706	} else {
    707		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
    708	}
    709	return rc;
    710}
    711
    712/*
    713 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
    714 */
    715static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
    716{
    717	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
    718	const uint64_t size = memory_block_size_bytes();
    719
    720	return virtio_mem_remove_memory(vm, addr, size);
    721}
    722
    723/*
    724 * Try offlining and removing memory from Linux.
    725 *
    726 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
    727 * onlining code).
    728 *
    729 * Will not modify the state of memory blocks in virtio-mem.
    730 */
    731static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
    732						uint64_t addr,
    733						uint64_t size)
    734{
    735	int rc;
    736
    737	dev_dbg(&vm->vdev->dev,
    738		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
    739		addr + size - 1);
    740
    741	rc = offline_and_remove_memory(addr, size);
    742	if (!rc) {
    743		atomic64_sub(size, &vm->offline_size);
    744		/*
    745		 * We might have freed up memory we can now unplug, retry
    746		 * immediately instead of waiting.
    747		 */
    748		virtio_mem_retry(vm);
    749	} else {
    750		dev_dbg(&vm->vdev->dev,
    751			"offlining and removing memory failed: %d\n", rc);
    752	}
    753	return rc;
    754}
    755
    756/*
    757 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
    758 * a single Linux memory block.
    759 */
    760static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
    761						unsigned long mb_id)
    762{
    763	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
    764	const uint64_t size = memory_block_size_bytes();
    765
    766	return virtio_mem_offline_and_remove_memory(vm, addr, size);
    767}
    768
    769/*
     770 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
     771 * all Linux memory blocks covered by the big block.
    772 */
    773static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
    774						unsigned long bb_id)
    775{
    776	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
    777	const uint64_t size = vm->bbm.bb_size;
    778
    779	return virtio_mem_offline_and_remove_memory(vm, addr, size);
    780}
    781
    782/*
    783 * Trigger the workqueue so the device can perform its magic.
    784 */
    785static void virtio_mem_retry(struct virtio_mem *vm)
    786{
    787	unsigned long flags;
    788
    789	spin_lock_irqsave(&vm->removal_lock, flags);
    790	if (!vm->removing)
    791		queue_work(system_freezable_wq, &vm->wq);
    792	spin_unlock_irqrestore(&vm->removal_lock, flags);
    793}
    794
    795static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
    796{
    797	int node = NUMA_NO_NODE;
    798
    799#if defined(CONFIG_ACPI_NUMA)
    800	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
    801		node = pxm_to_node(node_id);
    802#endif
    803	return node;
    804}
    805
    806/*
    807 * Test if a virtio-mem device overlaps with the given range. Can be called
    808 * from (notifier) callbacks lockless.
    809 */
    810static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
    811				      uint64_t size)
    812{
    813	return start < vm->addr + vm->region_size && vm->addr < start + size;
    814}
    815
    816/*
    817 * Test if a virtio-mem device contains a given range. Can be called from
    818 * (notifier) callbacks lockless.
    819 */
    820static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
    821				      uint64_t size)
    822{
    823	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
    824}
    825
    826static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
    827					      unsigned long mb_id)
    828{
    829	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
    830	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
    831	case VIRTIO_MEM_SBM_MB_OFFLINE:
    832		return NOTIFY_OK;
    833	default:
    834		break;
    835	}
    836	dev_warn_ratelimited(&vm->vdev->dev,
    837			     "memory block onlining denied\n");
    838	return NOTIFY_BAD;
    839}
    840
    841static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
    842					  unsigned long mb_id)
    843{
    844	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
    845	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
    846	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
    847		virtio_mem_sbm_set_mb_state(vm, mb_id,
    848					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
    849		break;
    850	case VIRTIO_MEM_SBM_MB_KERNEL:
    851	case VIRTIO_MEM_SBM_MB_MOVABLE:
    852		virtio_mem_sbm_set_mb_state(vm, mb_id,
    853					    VIRTIO_MEM_SBM_MB_OFFLINE);
    854		break;
    855	default:
    856		BUG();
    857		break;
    858	}
    859}
    860
    861static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
    862					 unsigned long mb_id,
    863					 unsigned long start_pfn)
    864{
    865	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
    866				ZONE_MOVABLE;
    867	int new_state;
    868
    869	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
    870	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
    871		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
    872		if (is_movable)
    873			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
    874		break;
    875	case VIRTIO_MEM_SBM_MB_OFFLINE:
    876		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
    877		if (is_movable)
    878			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
    879		break;
    880	default:
    881		BUG();
    882		break;
    883	}
    884	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
    885}
    886
    887static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
    888						unsigned long mb_id)
    889{
    890	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
    891	unsigned long pfn;
    892	int sb_id;
    893
    894	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
    895		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
    896			continue;
    897		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
    898			       sb_id * vm->sbm.sb_size);
    899		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
    900	}
    901}
    902
    903static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
    904						 unsigned long mb_id)
    905{
    906	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
    907	unsigned long pfn;
    908	int sb_id;
    909
    910	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
    911		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
    912			continue;
    913		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
    914			       sb_id * vm->sbm.sb_size);
    915		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
    916	}
    917}
    918
    919static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
    920						unsigned long bb_id,
    921						unsigned long pfn,
    922						unsigned long nr_pages)
    923{
    924	/*
    925	 * When marked as "fake-offline", all online memory of this device block
    926	 * is allocated by us. Otherwise, we don't have any memory allocated.
    927	 */
    928	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
    929	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
    930		return;
    931	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
    932}
    933
    934static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
    935						 unsigned long bb_id,
    936						 unsigned long pfn,
    937						 unsigned long nr_pages)
    938{
    939	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
    940	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
    941		return;
    942	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
    943}
    944
    945/*
    946 * This callback will either be called synchronously from add_memory() or
    947 * asynchronously (e.g., triggered via user space). We have to be careful
    948 * with locking when calling add_memory().
    949 */
    950static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
    951					 unsigned long action, void *arg)
    952{
    953	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
    954					     memory_notifier);
    955	struct memory_notify *mhp = arg;
    956	const unsigned long start = PFN_PHYS(mhp->start_pfn);
    957	const unsigned long size = PFN_PHYS(mhp->nr_pages);
    958	int rc = NOTIFY_OK;
    959	unsigned long id;
    960
    961	if (!virtio_mem_overlaps_range(vm, start, size))
    962		return NOTIFY_DONE;
    963
    964	if (vm->in_sbm) {
    965		id = virtio_mem_phys_to_mb_id(start);
    966		/*
    967		 * In SBM, we add memory in separate memory blocks - we expect
    968		 * it to be onlined/offlined in the same granularity. Bail out
    969		 * if this ever changes.
    970		 */
    971		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
    972				 !IS_ALIGNED(start, memory_block_size_bytes())))
    973			return NOTIFY_BAD;
    974	} else {
    975		id = virtio_mem_phys_to_bb_id(vm, start);
    976		/*
    977		 * In BBM, we only care about onlining/offlining happening
    978		 * within a single big block, we don't care about the
    979		 * actual granularity as we don't track individual Linux
    980		 * memory blocks.
    981		 */
    982		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
    983			return NOTIFY_BAD;
    984	}
    985
    986	/*
    987	 * Avoid circular locking lockdep warnings. We lock the mutex
    988	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
     989	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
    990	 * between both notifier calls and will bail out. False positive.
    991	 */
    992	lockdep_off();
    993
    994	switch (action) {
    995	case MEM_GOING_OFFLINE:
    996		mutex_lock(&vm->hotplug_mutex);
    997		if (vm->removing) {
    998			rc = notifier_from_errno(-EBUSY);
    999			mutex_unlock(&vm->hotplug_mutex);
   1000			break;
   1001		}
   1002		vm->hotplug_active = true;
   1003		if (vm->in_sbm)
   1004			virtio_mem_sbm_notify_going_offline(vm, id);
   1005		else
   1006			virtio_mem_bbm_notify_going_offline(vm, id,
   1007							    mhp->start_pfn,
   1008							    mhp->nr_pages);
   1009		break;
   1010	case MEM_GOING_ONLINE:
   1011		mutex_lock(&vm->hotplug_mutex);
   1012		if (vm->removing) {
   1013			rc = notifier_from_errno(-EBUSY);
   1014			mutex_unlock(&vm->hotplug_mutex);
   1015			break;
   1016		}
   1017		vm->hotplug_active = true;
   1018		if (vm->in_sbm)
   1019			rc = virtio_mem_sbm_notify_going_online(vm, id);
   1020		break;
   1021	case MEM_OFFLINE:
   1022		if (vm->in_sbm)
   1023			virtio_mem_sbm_notify_offline(vm, id);
   1024
   1025		atomic64_add(size, &vm->offline_size);
   1026		/*
   1027		 * Trigger the workqueue. Now that we have some offline memory,
   1028		 * maybe we can handle pending unplug requests.
   1029		 */
   1030		if (!unplug_online)
   1031			virtio_mem_retry(vm);
   1032
   1033		vm->hotplug_active = false;
   1034		mutex_unlock(&vm->hotplug_mutex);
   1035		break;
   1036	case MEM_ONLINE:
   1037		if (vm->in_sbm)
   1038			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
   1039
   1040		atomic64_sub(size, &vm->offline_size);
   1041		/*
    1042		 * Start adding more memory once we've onlined half of our
    1043		 * threshold. Don't trigger if it's possibly due to our action
   1044		 * (e.g., us adding memory which gets onlined immediately from
   1045		 * the core).
   1046		 */
   1047		if (!atomic_read(&vm->wq_active) &&
   1048		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
   1049			virtio_mem_retry(vm);
   1050
   1051		vm->hotplug_active = false;
   1052		mutex_unlock(&vm->hotplug_mutex);
   1053		break;
   1054	case MEM_CANCEL_OFFLINE:
   1055		if (!vm->hotplug_active)
   1056			break;
   1057		if (vm->in_sbm)
   1058			virtio_mem_sbm_notify_cancel_offline(vm, id);
   1059		else
   1060			virtio_mem_bbm_notify_cancel_offline(vm, id,
   1061							     mhp->start_pfn,
   1062							     mhp->nr_pages);
   1063		vm->hotplug_active = false;
   1064		mutex_unlock(&vm->hotplug_mutex);
   1065		break;
   1066	case MEM_CANCEL_ONLINE:
   1067		if (!vm->hotplug_active)
   1068			break;
   1069		vm->hotplug_active = false;
   1070		mutex_unlock(&vm->hotplug_mutex);
   1071		break;
   1072	default:
   1073		break;
   1074	}
   1075
   1076	lockdep_on();
   1077
   1078	return rc;
   1079}
   1080
   1081/*
   1082 * Set a range of pages PG_offline. Remember pages that were never onlined
   1083 * (via generic_online_page()) using PageDirty().
   1084 */
   1085static void virtio_mem_set_fake_offline(unsigned long pfn,
   1086					unsigned long nr_pages, bool onlined)
   1087{
   1088	page_offline_begin();
   1089	for (; nr_pages--; pfn++) {
   1090		struct page *page = pfn_to_page(pfn);
   1091
   1092		__SetPageOffline(page);
   1093		if (!onlined) {
   1094			SetPageDirty(page);
   1095			/* FIXME: remove after cleanups */
   1096			ClearPageReserved(page);
   1097		}
   1098	}
   1099	page_offline_end();
   1100}
   1101
   1102/*
    1103 * Clear PG_offline from a range of pages. If the pages were never onlined
    1104 * (via generic_online_page()), clear PageDirty().
   1105 */
   1106static void virtio_mem_clear_fake_offline(unsigned long pfn,
   1107					  unsigned long nr_pages, bool onlined)
   1108{
   1109	for (; nr_pages--; pfn++) {
   1110		struct page *page = pfn_to_page(pfn);
   1111
   1112		__ClearPageOffline(page);
   1113		if (!onlined)
   1114			ClearPageDirty(page);
   1115	}
   1116}
   1117
   1118/*
   1119 * Release a range of fake-offline pages to the buddy, effectively
   1120 * fake-onlining them.
   1121 */
   1122static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
   1123{
   1124	unsigned long order = MAX_ORDER - 1;
   1125	unsigned long i;
   1126
   1127	/*
   1128	 * We might get called for ranges that don't cover properly aligned
   1129	 * MAX_ORDER - 1 pages; however, we can only online properly aligned
   1130	 * pages with an order of MAX_ORDER - 1 at maximum.
   1131	 */
   1132	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
   1133		order--;
   1134
   1135	for (i = 0; i < nr_pages; i += 1 << order) {
   1136		struct page *page = pfn_to_page(pfn + i);
   1137
   1138		/*
   1139		 * If the page is PageDirty(), it was kept fake-offline when
   1140		 * onlining the memory block. Otherwise, it was allocated
   1141		 * using alloc_contig_range(). All pages in a subblock are
   1142		 * alike.
   1143		 */
   1144		if (PageDirty(page)) {
   1145			virtio_mem_clear_fake_offline(pfn + i, 1 << order, false);
   1146			generic_online_page(page, order);
   1147		} else {
   1148			virtio_mem_clear_fake_offline(pfn + i, 1 << order, true);
   1149			free_contig_range(pfn + i, 1 << order);
   1150			adjust_managed_page_count(page, 1 << order);
   1151		}
   1152	}
   1153}
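
/*
 * Worked example for the alignment loop above (values assumed for
 * illustration): for pfn = 0x600 and nr_pages = 0x200, pfn | nr_pages =
 * 0x600 is aligned to 1 << 9 pages but not 1 << 10, so the order drops
 * until 1 << order == 0x200 and the range is released as a single
 * 0x200-page chunk.
 */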
   1154
   1155/*
   1156 * Try to allocate a range, marking pages fake-offline, effectively
   1157 * fake-offlining them.
   1158 */
   1159static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
   1160{
   1161	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
   1162				ZONE_MOVABLE;
   1163	int rc, retry_count;
   1164
   1165	/*
   1166	 * TODO: We want an alloc_contig_range() mode that tries to allocate
   1167	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
   1168	 * with ZONE_MOVABLE. So for now, retry a couple of times with
   1169	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
   1170	 * some guarantees.
   1171	 */
   1172	for (retry_count = 0; retry_count < 5; retry_count++) {
   1173		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
   1174					GFP_KERNEL);
   1175		if (rc == -ENOMEM)
   1176			/* whoops, out of memory */
   1177			return rc;
   1178		else if (rc && !is_movable)
   1179			break;
   1180		else if (rc)
   1181			continue;
   1182
   1183		virtio_mem_set_fake_offline(pfn, nr_pages, true);
   1184		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
   1185		return 0;
   1186	}
   1187
   1188	return -EBUSY;
   1189}
   1190
   1191/*
   1192 * Handle fake-offline pages when memory is going offline - such that the
   1193 * pages can be skipped by mm-core when offlining.
   1194 */
   1195static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
   1196						  unsigned long nr_pages)
   1197{
   1198	struct page *page;
   1199	unsigned long i;
   1200
   1201	/*
   1202	 * Drop our reference to the pages so the memory can get offlined
   1203	 * and add the unplugged pages to the managed page counters (so
   1204	 * offlining code can correctly subtract them again).
   1205	 */
   1206	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
   1207	/* Drop our reference to the pages so the memory can get offlined. */
   1208	for (i = 0; i < nr_pages; i++) {
   1209		page = pfn_to_page(pfn + i);
   1210		if (WARN_ON(!page_ref_dec_and_test(page)))
   1211			dump_page(page, "fake-offline page referenced");
   1212	}
   1213}
   1214
   1215/*
   1216 * Handle fake-offline pages when memory offlining is canceled - to undo
   1217 * what we did in virtio_mem_fake_offline_going_offline().
   1218 */
   1219static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
   1220						   unsigned long nr_pages)
   1221{
   1222	unsigned long i;
   1223
   1224	/*
   1225	 * Get the reference we dropped when going offline and subtract the
   1226	 * unplugged pages from the managed page counters.
   1227	 */
   1228	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
   1229	for (i = 0; i < nr_pages; i++)
   1230		page_ref_inc(pfn_to_page(pfn + i));
   1231}
   1232
   1233static void virtio_mem_online_page(struct virtio_mem *vm,
   1234				   struct page *page, unsigned int order)
   1235{
   1236	const unsigned long start = page_to_phys(page);
   1237	const unsigned long end = start + PFN_PHYS(1 << order);
   1238	unsigned long addr, next, id, sb_id, count;
   1239	bool do_online;
   1240
   1241	/*
   1242	 * We can get called with any order up to MAX_ORDER - 1. If our
   1243	 * subblock size is smaller than that and we have a mixture of plugged
   1244	 * and unplugged subblocks within such a page, we have to process in
   1245	 * smaller granularity. In that case we'll adjust the order exactly once
   1246	 * within the loop.
   1247	 */
   1248	for (addr = start; addr < end; ) {
   1249		next = addr + PFN_PHYS(1 << order);
   1250
   1251		if (vm->in_sbm) {
   1252			id = virtio_mem_phys_to_mb_id(addr);
   1253			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
   1254			count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;
   1255
   1256			if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
   1257				/* Fully plugged. */
   1258				do_online = true;
   1259			} else if (count == 1 ||
   1260				   virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
   1261				/* Fully unplugged. */
   1262				do_online = false;
   1263			} else {
   1264				/*
   1265				 * Mixture, process sub-blocks instead. This
   1266				 * will be at least the size of a pageblock.
   1267				 * We'll run into this case exactly once.
   1268				 */
   1269				order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
   1270				do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
   1271				continue;
   1272			}
   1273		} else {
   1274			/*
   1275			 * If the whole block is marked fake offline, keep
   1276			 * everything that way.
   1277			 */
   1278			id = virtio_mem_phys_to_bb_id(vm, addr);
   1279			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
   1280				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
   1281		}
   1282
   1283		if (do_online)
   1284			generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
   1285		else
   1286			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
   1287						    false);
   1288		addr = next;
   1289	}
   1290}
   1291
   1292static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
   1293{
   1294	const unsigned long addr = page_to_phys(page);
   1295	struct virtio_mem *vm;
   1296
   1297	rcu_read_lock();
   1298	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
   1299		/*
   1300		 * Pages we're onlining will never cross memory blocks and,
   1301		 * therefore, not virtio-mem devices.
   1302		 */
   1303		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
   1304			continue;
   1305
   1306		/*
   1307		 * virtio_mem_set_fake_offline() might sleep. We can safely
   1308		 * drop the RCU lock at this point because the device
   1309		 * cannot go away. See virtio_mem_remove() how races
   1310		 * between memory onlining and device removal are handled.
   1311		 */
   1312		rcu_read_unlock();
   1313
   1314		virtio_mem_online_page(vm, page, order);
   1315		return;
   1316	}
   1317	rcu_read_unlock();
   1318
   1319	/* not virtio-mem memory, but e.g., a DIMM. online it */
   1320	generic_online_page(page, order);
   1321}
   1322
   1323static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
   1324					const struct virtio_mem_req *req)
   1325{
   1326	struct scatterlist *sgs[2], sg_req, sg_resp;
   1327	unsigned int len;
   1328	int rc;
   1329
   1330	/* don't use the request residing on the stack (vaddr) */
   1331	vm->req = *req;
   1332
   1333	/* out: buffer for request */
   1334	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
   1335	sgs[0] = &sg_req;
   1336
   1337	/* in: buffer for response */
   1338	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
   1339	sgs[1] = &sg_resp;
   1340
   1341	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
   1342	if (rc < 0)
   1343		return rc;
   1344
   1345	virtqueue_kick(vm->vq);
   1346
   1347	/* wait for a response */
   1348	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
   1349
   1350	return virtio16_to_cpu(vm->vdev, vm->resp.type);
   1351}
   1352
   1353static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
   1354					uint64_t size)
   1355{
   1356	const uint64_t nb_vm_blocks = size / vm->device_block_size;
   1357	const struct virtio_mem_req req = {
   1358		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
   1359		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
   1360		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
   1361	};
   1362	int rc = -ENOMEM;
   1363
   1364	if (atomic_read(&vm->config_changed))
   1365		return -EAGAIN;
   1366
   1367	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
   1368		addr + size - 1);
   1369
   1370	switch (virtio_mem_send_request(vm, &req)) {
   1371	case VIRTIO_MEM_RESP_ACK:
   1372		vm->plugged_size += size;
   1373		return 0;
   1374	case VIRTIO_MEM_RESP_NACK:
   1375		rc = -EAGAIN;
   1376		break;
   1377	case VIRTIO_MEM_RESP_BUSY:
   1378		rc = -ETXTBSY;
   1379		break;
   1380	case VIRTIO_MEM_RESP_ERROR:
   1381		rc = -EINVAL;
   1382		break;
   1383	default:
   1384		break;
   1385	}
   1386
   1387	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
   1388	return rc;
   1389}
   1390
   1391static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
   1392					  uint64_t size)
   1393{
   1394	const uint64_t nb_vm_blocks = size / vm->device_block_size;
   1395	const struct virtio_mem_req req = {
   1396		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
   1397		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
   1398		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
   1399	};
   1400	int rc = -ENOMEM;
   1401
   1402	if (atomic_read(&vm->config_changed))
   1403		return -EAGAIN;
   1404
   1405	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
   1406		addr + size - 1);
   1407
   1408	switch (virtio_mem_send_request(vm, &req)) {
   1409	case VIRTIO_MEM_RESP_ACK:
   1410		vm->plugged_size -= size;
   1411		return 0;
   1412	case VIRTIO_MEM_RESP_BUSY:
   1413		rc = -ETXTBSY;
   1414		break;
   1415	case VIRTIO_MEM_RESP_ERROR:
   1416		rc = -EINVAL;
   1417		break;
   1418	default:
   1419		break;
   1420	}
   1421
   1422	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
   1423	return rc;
   1424}
   1425
   1426static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
   1427{
   1428	const struct virtio_mem_req req = {
   1429		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
   1430	};
   1431	int rc = -ENOMEM;
   1432
   1433	dev_dbg(&vm->vdev->dev, "unplugging all memory");
   1434
   1435	switch (virtio_mem_send_request(vm, &req)) {
   1436	case VIRTIO_MEM_RESP_ACK:
   1437		vm->unplug_all_required = false;
   1438		vm->plugged_size = 0;
   1439		/* usable region might have shrunk */
   1440		atomic_set(&vm->config_changed, 1);
   1441		return 0;
   1442	case VIRTIO_MEM_RESP_BUSY:
   1443		rc = -ETXTBSY;
   1444		break;
   1445	default:
   1446		break;
   1447	}
   1448
   1449	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
   1450	return rc;
   1451}
   1452
   1453/*
   1454 * Plug selected subblocks. Updates the plugged state, but not the state
   1455 * of the memory block.
   1456 */
   1457static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
   1458				  int sb_id, int count)
   1459{
   1460	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
   1461			      sb_id * vm->sbm.sb_size;
   1462	const uint64_t size = count * vm->sbm.sb_size;
   1463	int rc;
   1464
   1465	rc = virtio_mem_send_plug_request(vm, addr, size);
   1466	if (!rc)
   1467		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
   1468	return rc;
   1469}
   1470
   1471/*
   1472 * Unplug selected subblocks. Updates the plugged state, but not the state
   1473 * of the memory block.
   1474 */
   1475static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
   1476				    int sb_id, int count)
   1477{
   1478	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
   1479			      sb_id * vm->sbm.sb_size;
   1480	const uint64_t size = count * vm->sbm.sb_size;
   1481	int rc;
   1482
   1483	rc = virtio_mem_send_unplug_request(vm, addr, size);
   1484	if (!rc)
   1485		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
   1486	return rc;
   1487}
   1488
   1489/*
   1490 * Request to unplug a big block.
   1491 *
   1492 * Will not modify the state of the big block.
   1493 */
   1494static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
   1495{
   1496	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
   1497	const uint64_t size = vm->bbm.bb_size;
   1498
   1499	return virtio_mem_send_unplug_request(vm, addr, size);
   1500}
   1501
   1502/*
   1503 * Request to plug a big block.
   1504 *
   1505 * Will not modify the state of the big block.
   1506 */
   1507static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
   1508{
   1509	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
   1510	const uint64_t size = vm->bbm.bb_size;
   1511
   1512	return virtio_mem_send_plug_request(vm, addr, size);
   1513}
   1514
   1515/*
    1516 * Unplug the desired number of plugged subblocks of an offline or not-added
   1517 * memory block. Will fail if any subblock cannot get unplugged (instead of
   1518 * skipping it).
   1519 *
   1520 * Will not modify the state of the memory block.
   1521 *
   1522 * Note: can fail after some subblocks were unplugged.
   1523 */
   1524static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
   1525					    unsigned long mb_id, uint64_t *nb_sb)
   1526{
   1527	int sb_id, count;
   1528	int rc;
   1529
   1530	sb_id = vm->sbm.sbs_per_mb - 1;
   1531	while (*nb_sb) {
   1532		/* Find the next candidate subblock */
   1533		while (sb_id >= 0 &&
   1534		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
   1535			sb_id--;
   1536		if (sb_id < 0)
   1537			break;
   1538		/* Try to unplug multiple subblocks at a time */
   1539		count = 1;
   1540		while (count < *nb_sb && sb_id > 0 &&
   1541		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
   1542			count++;
   1543			sb_id--;
   1544		}
   1545
   1546		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
   1547		if (rc)
   1548			return rc;
   1549		*nb_sb -= count;
   1550		sb_id--;
   1551	}
   1552
   1553	return 0;
   1554}
   1555
   1556/*
   1557 * Unplug all plugged subblocks of an offline or not-added memory block.
   1558 *
   1559 * Will not modify the state of the memory block.
   1560 *
   1561 * Note: can fail after some subblocks were unplugged.
   1562 */
   1563static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
   1564{
   1565	uint64_t nb_sb = vm->sbm.sbs_per_mb;
   1566
   1567	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
   1568}
   1569
   1570/*
   1571 * Prepare tracking data for the next memory block.
   1572 */
   1573static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
   1574					  unsigned long *mb_id)
   1575{
   1576	int rc;
   1577
   1578	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
   1579		return -ENOSPC;
   1580
   1581	/* Resize the state array if required. */
   1582	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
   1583	if (rc)
   1584		return rc;
   1585
   1586	/* Resize the subblock bitmap if required. */
   1587	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
   1588	if (rc)
   1589		return rc;
   1590
   1591	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
   1592	*mb_id = vm->sbm.next_mb_id++;
   1593	return 0;
   1594}
   1595
   1596/*
   1597 * Try to plug the desired number of subblocks and add the memory block
   1598 * to Linux.
   1599 *
   1600 * Will modify the state of the memory block.
   1601 */
   1602static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
   1603					  unsigned long mb_id, uint64_t *nb_sb)
   1604{
   1605	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
   1606	int rc;
   1607
   1608	if (WARN_ON_ONCE(!count))
   1609		return -EINVAL;
   1610
   1611	/*
   1612	 * Plug the requested number of subblocks before adding it to linux,
   1613	 * so that onlining will directly online all plugged subblocks.
   1614	 */
   1615	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
   1616	if (rc)
   1617		return rc;
   1618
   1619	/*
   1620	 * Mark the block properly offline before adding it to Linux,
   1621	 * so the memory notifiers will find the block in the right state.
   1622	 */
   1623	if (count == vm->sbm.sbs_per_mb)
   1624		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1625					    VIRTIO_MEM_SBM_MB_OFFLINE);
   1626	else
   1627		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1628					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
   1629
   1630	/* Add the memory block to linux - if that fails, try to unplug. */
   1631	rc = virtio_mem_sbm_add_mb(vm, mb_id);
   1632	if (rc) {
   1633		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
   1634
   1635		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
   1636			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
   1637		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
   1638		return rc;
   1639	}
   1640
   1641	*nb_sb -= count;
   1642	return 0;
   1643}
   1644
   1645/*
   1646 * Try to plug the desired number of subblocks of a memory block that
   1647 * is already added to Linux.
   1648 *
   1649 * Will modify the state of the memory block.
   1650 *
   1651 * Note: Can fail after some subblocks were successfully plugged.
   1652 */
   1653static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
   1654				      unsigned long mb_id, uint64_t *nb_sb)
   1655{
   1656	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
   1657	unsigned long pfn, nr_pages;
   1658	int sb_id, count;
   1659	int rc;
   1660
   1661	if (WARN_ON_ONCE(!*nb_sb))
   1662		return -EINVAL;
   1663
   1664	while (*nb_sb) {
   1665		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
   1666		if (sb_id >= vm->sbm.sbs_per_mb)
   1667			break;
   1668		count = 1;
   1669		while (count < *nb_sb &&
   1670		       sb_id + count < vm->sbm.sbs_per_mb &&
   1671		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
   1672			count++;
   1673
   1674		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
   1675		if (rc)
   1676			return rc;
   1677		*nb_sb -= count;
   1678		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
   1679			continue;
   1680
   1681		/* fake-online the pages if the memory block is online */
   1682		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
   1683			       sb_id * vm->sbm.sb_size);
   1684		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
   1685		virtio_mem_fake_online(pfn, nr_pages);
   1686	}
   1687
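       	/*
       	 * Fully plugged now? Each VIRTIO_MEM_SBM_MB_*_PARTIAL state directly
       	 * follows its fully-plugged counterpart in the enum, so "old_state - 1"
       	 * maps e.g. KERNEL_PARTIAL back to KERNEL.
       	 */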
   1688	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
   1689		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
   1690
   1691	return 0;
   1692}
   1693
   1694static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
   1695{
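       	/*
       	 * Fill partially plugged blocks that are already added to Linux
       	 * first - online ones before offline ones - before plugging and
       	 * adding new memory blocks below.
       	 */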
   1696	const int mb_states[] = {
   1697		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
   1698		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
   1699		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
   1700	};
   1701	uint64_t nb_sb = diff / vm->sbm.sb_size;
   1702	unsigned long mb_id;
   1703	int rc, i;
   1704
   1705	if (!nb_sb)
   1706		return 0;
   1707
   1708	/* Don't race with onlining/offlining */
   1709	mutex_lock(&vm->hotplug_mutex);
   1710
   1711	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
   1712		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
   1713			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
   1714			if (rc || !nb_sb)
   1715				goto out_unlock;
   1716			cond_resched();
   1717		}
   1718	}
   1719
   1720	/*
   1721	 * We won't be working on online/offline memory blocks from this point,
   1722	 * so we can't race with memory onlining/offlining. Drop the mutex.
   1723	 */
   1724	mutex_unlock(&vm->hotplug_mutex);
   1725
   1726	/* Try to plug and add unused blocks */
   1727	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
   1728		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
   1729			return -ENOSPC;
   1730
   1731		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
   1732		if (rc || !nb_sb)
   1733			return rc;
   1734		cond_resched();
   1735	}
   1736
   1737	/* Try to prepare, plug and add new blocks */
   1738	while (nb_sb) {
   1739		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
   1740			return -ENOSPC;
   1741
   1742		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
   1743		if (rc)
   1744			return rc;
   1745		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
   1746		if (rc)
   1747			return rc;
   1748		cond_resched();
   1749	}
   1750
   1751	return 0;
   1752out_unlock:
   1753	mutex_unlock(&vm->hotplug_mutex);
   1754	return rc;
   1755}
   1756
   1757/*
   1758 * Plug a big block and add it to Linux.
   1759 *
   1760 * Will modify the state of the big block.
   1761 */
   1762static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
   1763					  unsigned long bb_id)
   1764{
   1765	int rc;
   1766
   1767	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
   1768			 VIRTIO_MEM_BBM_BB_UNUSED))
   1769		return -EINVAL;
   1770
   1771	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
   1772	if (rc)
   1773		return rc;
   1774	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
   1775
   1776	rc = virtio_mem_bbm_add_bb(vm, bb_id);
   1777	if (rc) {
   1778		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
   1779			virtio_mem_bbm_set_bb_state(vm, bb_id,
   1780						    VIRTIO_MEM_BBM_BB_UNUSED);
   1781		else
   1782			/* Retry from the main loop. */
   1783			virtio_mem_bbm_set_bb_state(vm, bb_id,
   1784						    VIRTIO_MEM_BBM_BB_PLUGGED);
   1785		return rc;
   1786	}
   1787	return 0;
   1788}
   1789
   1790/*
   1791 * Prepare tracking data for the next big block.
   1792 */
   1793static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
   1794					  unsigned long *bb_id)
   1795{
   1796	int rc;
   1797
   1798	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
   1799		return -ENOSPC;
   1800
   1801	/* Resize the big block state array if required. */
   1802	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
   1803	if (rc)
   1804		return rc;
   1805
   1806	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
   1807	*bb_id = vm->bbm.next_bb_id;
   1808	vm->bbm.next_bb_id++;
   1809	return 0;
   1810}
   1811
   1812static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
   1813{
   1814	uint64_t nb_bb = diff / vm->bbm.bb_size;
   1815	unsigned long bb_id;
   1816	int rc;
   1817
   1818	if (!nb_bb)
   1819		return 0;
   1820
   1821	/* Try to plug and add unused big blocks */
   1822	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
   1823		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
   1824			return -ENOSPC;
   1825
   1826		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
   1827		if (!rc)
   1828			nb_bb--;
   1829		if (rc || !nb_bb)
   1830			return rc;
   1831		cond_resched();
   1832	}
   1833
   1834	/* Try to prepare, plug and add new big blocks */
   1835	while (nb_bb) {
   1836		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
   1837			return -ENOSPC;
   1838
   1839		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
   1840		if (rc)
   1841			return rc;
   1842		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
   1843		if (!rc)
   1844			nb_bb--;
   1845		if (rc)
   1846			return rc;
   1847		cond_resched();
   1848	}
   1849
   1850	return 0;
   1851}
   1852
   1853/*
   1854 * Try to plug the requested amount of memory.
   1855 */
   1856static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
   1857{
   1858	if (vm->in_sbm)
   1859		return virtio_mem_sbm_plug_request(vm, diff);
   1860	return virtio_mem_bbm_plug_request(vm, diff);
   1861}
   1862
   1863/*
   1864 * Unplug the desired number of plugged subblocks of an offline memory block.
   1865 * Will fail if any subblock cannot get unplugged (instead of skipping it).
   1866 *
   1867 * Will modify the state of the memory block. Might temporarily drop the
   1868 * hotplug_mutex.
   1869 *
   1870 * Note: Can fail after some subblocks were successfully unplugged.
   1871 */
   1872static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
   1873						unsigned long mb_id,
   1874						uint64_t *nb_sb)
   1875{
   1876	int rc;
   1877
   1878	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
   1879
   1880	/* some subblocks might have been unplugged even on failure */
   1881	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
   1882		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1883					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
   1884	if (rc)
   1885		return rc;
   1886
   1887	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
   1888		/*
   1889		 * Remove the block from Linux - this should never fail.
   1890		 * Prevent the block from getting onlined by marking it
   1891		 * unplugged. Temporarily drop the mutex, so any pending
   1892		 * GOING_ONLINE requests can be serviced/rejected.
   1893		 */
   1894		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1895					    VIRTIO_MEM_SBM_MB_UNUSED);
   1896
   1897		mutex_unlock(&vm->hotplug_mutex);
   1898		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
   1899		BUG_ON(rc);
   1900		mutex_lock(&vm->hotplug_mutex);
   1901	}
   1902	return 0;
   1903}
   1904
   1905/*
   1906 * Unplug the given plugged subblocks of an online memory block.
   1907 *
   1908 * Will modify the state of the memory block.
   1909 */
   1910static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
   1911					   unsigned long mb_id, int sb_id,
   1912					   int count)
   1913{
   1914	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
   1915	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
   1916	unsigned long start_pfn;
   1917	int rc;
   1918
   1919	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
   1920			     sb_id * vm->sbm.sb_size);
   1921
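       	/* Allocate/isolate the pages first, so they cannot get used concurrently. */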
   1922	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
   1923	if (rc)
   1924		return rc;
   1925
   1926	/* Try to unplug the allocated memory */
   1927	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
   1928	if (rc) {
   1929		/* Return the memory to the buddy. */
   1930		virtio_mem_fake_online(start_pfn, nr_pages);
   1931		return rc;
   1932	}
   1933
   1934	switch (old_state) {
   1935	case VIRTIO_MEM_SBM_MB_KERNEL:
   1936		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1937					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
   1938		break;
   1939	case VIRTIO_MEM_SBM_MB_MOVABLE:
   1940		virtio_mem_sbm_set_mb_state(vm, mb_id,
   1941					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
   1942		break;
   1943	}
   1944
   1945	return 0;
   1946}
   1947
   1948/*
   1949 * Unplug the desired number of plugged subblocks of an online memory block.
   1950 * Will skip subblocks that are busy.
   1951 *
   1952 * Will modify the state of the memory block. Might temporarily drop the
   1953 * hotplug_mutex.
   1954 *
   1955 * Note: Can fail after some subblocks were successfully unplugged. Can
   1956 *       return 0 even if subblocks were busy and could not get unplugged.
   1957 */
   1958static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
   1959					       unsigned long mb_id,
   1960					       uint64_t *nb_sb)
   1961{
   1962	int rc, sb_id;
   1963
   1964	/* If possible, try to unplug the complete block in one shot. */
   1965	if (*nb_sb >= vm->sbm.sbs_per_mb &&
   1966	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
   1967		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
   1968						     vm->sbm.sbs_per_mb);
   1969		if (!rc) {
   1970			*nb_sb -= vm->sbm.sbs_per_mb;
   1971			goto unplugged;
   1972		} else if (rc != -EBUSY)
   1973			return rc;
   1974	}
   1975
   1976	/* Fall back to single subblocks. */
   1977	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
   1978		/* Find the next candidate subblock */
   1979		while (sb_id >= 0 &&
   1980		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
   1981			sb_id--;
   1982		if (sb_id < 0)
   1983			break;
   1984
   1985		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
   1986		if (rc == -EBUSY)
   1987			continue;
   1988		else if (rc)
   1989			return rc;
   1990		*nb_sb -= 1;
   1991	}
   1992
   1993unplugged:
   1994	/*
   1995	 * Once all subblocks of a memory block have been unplugged, offline and
   1996	 * remove it. This will usually not fail, as no memory is in use
   1997	 * anymore - however, some other notifiers might NACK the request.
   1998	 */
   1999	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
   2000		mutex_unlock(&vm->hotplug_mutex);
   2001		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
   2002		mutex_lock(&vm->hotplug_mutex);
   2003		if (!rc)
   2004			virtio_mem_sbm_set_mb_state(vm, mb_id,
   2005						    VIRTIO_MEM_SBM_MB_UNUSED);
   2006	}
   2007
   2008	return 0;
   2009}
   2010
   2011/*
   2012 * Unplug the desired number of plugged subblocks of a memory block that is
   2013 * already added to Linux. Will skip subblocks of online memory blocks
   2014 * that are busy (in use by the OS). Will fail if any subblock that's not
   2015 * busy cannot get unplugged.
   2016 *
   2017 * Will modify the state of the memory block. Might temporarily drop the
   2018 * hotplug_mutex.
   2019 *
   2020 * Note: Can fail after some subblocks were successfully unplugged. Can
   2021 *       return 0 even if subblocks were busy and could not get unplugged.
   2022 */
   2023static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
   2024					unsigned long mb_id,
   2025					uint64_t *nb_sb)
   2026{
   2027	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
   2028
   2029	switch (old_state) {
   2030	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
   2031	case VIRTIO_MEM_SBM_MB_KERNEL:
   2032	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
   2033	case VIRTIO_MEM_SBM_MB_MOVABLE:
   2034		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
   2035	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
   2036	case VIRTIO_MEM_SBM_MB_OFFLINE:
   2037		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
   2038	}
   2039	return -EINVAL;
   2040}
   2041
   2042static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
   2043{
   2044	const int mb_states[] = {
   2045		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
   2046		VIRTIO_MEM_SBM_MB_OFFLINE,
   2047		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
   2048		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
   2049		VIRTIO_MEM_SBM_MB_MOVABLE,
   2050		VIRTIO_MEM_SBM_MB_KERNEL,
   2051	};
   2052	uint64_t nb_sb = diff / vm->sbm.sb_size;
   2053	unsigned long mb_id;
   2054	int rc, i;
   2055
   2056	if (!nb_sb)
   2057		return 0;
   2058
   2059	/*
   2060	 * We'll drop the mutex a couple of times when it is safe to do so.
   2061	 * This might result in some blocks switching state (online/offline),
   2062	 * and we could miss them in this run - we will retry later.
   2063	 */
   2064	mutex_lock(&vm->hotplug_mutex);
   2065
   2066	/*
   2067	 * We try to unplug from partially plugged blocks first, to try removing
   2068	 * whole memory blocks along with their metadata. We prioritize
   2069	 * ZONE_MOVABLE, as unplugging memory and removing whole memory blocks
   2070	 * is more reliable there, and we don't want to trigger zone imbalances
   2071	 * by accidentally removing too much kernel memory.
   2072	 */
   2073	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
   2074		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
   2075			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
   2076			if (rc || !nb_sb)
   2077				goto out_unlock;
   2078			mutex_unlock(&vm->hotplug_mutex);
   2079			cond_resched();
   2080			mutex_lock(&vm->hotplug_mutex);
   2081		}
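       		/*
       		 * The first two states cover offline blocks only; without
       		 * unplug_online, never touch online memory.
       		 */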
   2082		if (!unplug_online && i == 1) {
   2083			mutex_unlock(&vm->hotplug_mutex);
   2084			return 0;
   2085		}
   2086	}
   2087
   2088	mutex_unlock(&vm->hotplug_mutex);
   2089	return nb_sb ? -EBUSY : 0;
   2090out_unlock:
   2091	mutex_unlock(&vm->hotplug_mutex);
   2092	return rc;
   2093}
   2094
   2095/*
   2096 * Try to offline and remove a big block from Linux and unplug it. Will fail
   2097 * with -EBUSY if some memory is busy and cannot get unplugged.
   2098 *
   2099 * Will modify the state of the memory block. Might temporarily drop the
   2100 * hotplug_mutex.
   2101 */
   2102static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
   2103						       unsigned long bb_id)
   2104{
   2105	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
   2106	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
   2107	unsigned long end_pfn = start_pfn + nr_pages;
   2108	unsigned long pfn;
   2109	struct page *page;
   2110	int rc;
   2111
   2112	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
   2113			 VIRTIO_MEM_BBM_BB_ADDED))
   2114		return -EINVAL;
   2115
   2116	if (bbm_safe_unplug) {
   2117		/*
   2118		 * Start by fake-offlining all memory. Once we've marked the device
   2119		 * block as fake-offline, all newly onlined memory will
   2120		 * automatically be kept fake-offline. Protect from concurrent
   2121		 * onlining/offlining until we have a consistent state.
   2122		 */
   2123		mutex_lock(&vm->hotplug_mutex);
   2124		virtio_mem_bbm_set_bb_state(vm, bb_id,
   2125					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
   2126
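       		/*
       		 * The online state is tracked per memory section, so probing
       		 * the first pfn of each section is sufficient.
       		 */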
   2127		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
   2128			page = pfn_to_online_page(pfn);
   2129			if (!page)
   2130				continue;
   2131
   2132			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
   2133			if (rc) {
   2134				end_pfn = pfn;
   2135				goto rollback_safe_unplug;
   2136			}
   2137		}
   2138		mutex_unlock(&vm->hotplug_mutex);
   2139	}
   2140
   2141	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
   2142	if (rc) {
   2143		if (bbm_safe_unplug) {
   2144			mutex_lock(&vm->hotplug_mutex);
   2145			goto rollback_safe_unplug;
   2146		}
   2147		return rc;
   2148	}
   2149
   2150	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
   2151	if (rc)
   2152		virtio_mem_bbm_set_bb_state(vm, bb_id,
   2153					    VIRTIO_MEM_BBM_BB_PLUGGED);
   2154	else
   2155		virtio_mem_bbm_set_bb_state(vm, bb_id,
   2156					    VIRTIO_MEM_BBM_BB_UNUSED);
   2157	return rc;
   2158
   2159rollback_safe_unplug:
   2160	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
   2161		page = pfn_to_online_page(pfn);
   2162		if (!page)
   2163			continue;
   2164		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
   2165	}
   2166	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
   2167	mutex_unlock(&vm->hotplug_mutex);
   2168	return rc;
   2169}
   2170
   2171/*
   2172 * Test if a big block is completely offline.
   2173 */
   2174static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
   2175					 unsigned long bb_id)
   2176{
   2177	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
   2178	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
   2179	unsigned long pfn;
   2180
   2181	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
   2182	     pfn += PAGES_PER_SECTION) {
   2183		if (pfn_to_online_page(pfn))
   2184			return false;
   2185	}
   2186
   2187	return true;
   2188}
   2189
   2190/*
   2191 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
   2192 */
   2193static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
   2194					 unsigned long bb_id)
   2195{
   2196	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
   2197	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
   2198	struct page *page;
   2199	unsigned long pfn;
   2200
   2201	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
   2202	     pfn += PAGES_PER_SECTION) {
   2203		page = pfn_to_online_page(pfn);
   2204		if (!page)
   2205			continue;
   2206		if (page_zonenum(page) != ZONE_MOVABLE)
   2207			return false;
   2208	}
   2209
   2210	return true;
   2211}
   2212
   2213static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
   2214{
   2215	uint64_t nb_bb = diff / vm->bbm.bb_size;
   2216	uint64_t bb_id;
   2217	int rc, i;
   2218
   2219	if (!nb_bb)
   2220		return 0;
   2221
   2222	/*
   2223	 * Try to unplug big blocks. Similar to SBM, start with offline blocks
   2224	 * (pass 0), then ZONE_MOVABLE ones (pass 1), then any added (pass 2).
   2225	 */
   2226	for (i = 0; i < 3; i++) {
   2227		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
   2228			cond_resched();
   2229
   2230			/*
   2231			 * As we're holding no locks, these checks are racy,
   2232			 * but we don't care.
   2233			 */
   2234			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
   2235				continue;
   2236			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
   2237				continue;
   2238			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
   2239			if (rc == -EBUSY)
   2240				continue;
   2241			if (!rc)
   2242				nb_bb--;
   2243			if (rc || !nb_bb)
   2244				return rc;
   2245		}
   2246		if (i == 0 && !unplug_online)
   2247			return 0;
   2248	}
   2249
   2250	return nb_bb ? -EBUSY : 0;
   2251}
   2252
   2253/*
   2254 * Try to unplug the requested amount of memory.
   2255 */
   2256static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
   2257{
   2258	if (vm->in_sbm)
   2259		return virtio_mem_sbm_unplug_request(vm, diff);
   2260	return virtio_mem_bbm_unplug_request(vm, diff);
   2261}
   2262
   2263/*
   2264 * Try to unplug all blocks that couldn't be unplugged before, for example,
   2265 * because the hypervisor was busy.
   2266 */
   2267static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
   2268{
   2269	unsigned long id;
   2270	int rc;
   2271
   2272	if (!vm->in_sbm) {
   2273		virtio_mem_bbm_for_each_bb(vm, id,
   2274					   VIRTIO_MEM_BBM_BB_PLUGGED) {
   2275			rc = virtio_mem_bbm_unplug_bb(vm, id);
   2276			if (rc)
   2277				return rc;
   2278			virtio_mem_bbm_set_bb_state(vm, id,
   2279						    VIRTIO_MEM_BBM_BB_UNUSED);
   2280		}
   2281		return 0;
   2282	}
   2283
   2284	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
   2285		rc = virtio_mem_sbm_unplug_mb(vm, id);
   2286		if (rc)
   2287			return rc;
   2288		virtio_mem_sbm_set_mb_state(vm, id,
   2289					    VIRTIO_MEM_SBM_MB_UNUSED);
   2290	}
   2291
   2292	return 0;
   2293}
   2294
   2295/*
   2296 * Update all parts of the config that could have changed.
   2297 */
   2298static void virtio_mem_refresh_config(struct virtio_mem *vm)
   2299{
   2300	const struct range pluggable_range = mhp_get_pluggable_range(true);
   2301	uint64_t new_plugged_size, usable_region_size, end_addr;
   2302
   2303	/* the plugged_size is just a reflection of what _we_ did previously */
   2304	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
   2305			&new_plugged_size);
   2306	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
   2307		vm->plugged_size = new_plugged_size;
   2308
   2309	/* calculate the last usable memory block id */
   2310	virtio_cread_le(vm->vdev, struct virtio_mem_config,
   2311			usable_region_size, &usable_region_size);
   2312	end_addr = min(vm->addr + usable_region_size - 1,
   2313		       pluggable_range.end);
   2314
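       	/*
       	 * If the usable region ends within a (big) block, that last block is
       	 * only partially usable and has to be excluded.
       	 */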
   2315	if (vm->in_sbm) {
   2316		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
   2317		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
   2318			vm->sbm.last_usable_mb_id--;
   2319	} else {
   2320		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
   2321								     end_addr);
   2322		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
   2323			vm->bbm.last_usable_bb_id--;
   2324	}
   2325	/*
   2326	 * If we cannot plug any of our device memory (e.g., nothing in the
   2327	 * usable region is addressable), the last usable memory block id will
   2328	 * be smaller than the first usable memory block id. We'll stop
   2329	 * attempting to add memory with -ENOSPC from our main loop.
   2330	 */
   2331
   2332	/* see if there is a request to change the size */
   2333	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
   2334			&vm->requested_size);
   2335
   2336	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
   2337	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
   2338}
   2339
   2340/*
   2341 * Workqueue function for handling plug/unplug requests and config updates.
   2342 */
   2343static void virtio_mem_run_wq(struct work_struct *work)
   2344{
   2345	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
   2346	uint64_t diff;
   2347	int rc;
   2348
   2349	if (unlikely(vm->in_kdump)) {
   2350		dev_warn_once(&vm->vdev->dev,
   2351			     "unexpected workqueue run in kdump kernel\n");
   2352		return;
   2353	}
   2354
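       	/* A pending retry is obsolete - the workqueue is running right now. */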
   2355	hrtimer_cancel(&vm->retry_timer);
   2356
   2357	if (vm->broken)
   2358		return;
   2359
   2360	atomic_set(&vm->wq_active, 1);
   2361retry:
   2362	rc = 0;
   2363
   2364	/* Make sure we start with a clean state if there are leftovers. */
   2365	if (unlikely(vm->unplug_all_required))
   2366		rc = virtio_mem_send_unplug_all_request(vm);
   2367
   2368	if (atomic_read(&vm->config_changed)) {
   2369		atomic_set(&vm->config_changed, 0);
   2370		virtio_mem_refresh_config(vm);
   2371	}
   2372
   2373	/* Unplug any leftovers from previous runs */
   2374	if (!rc)
   2375		rc = virtio_mem_unplug_pending_mb(vm);
   2376
   2377	if (!rc && vm->requested_size != vm->plugged_size) {
   2378		if (vm->requested_size > vm->plugged_size) {
   2379			diff = vm->requested_size - vm->plugged_size;
   2380			rc = virtio_mem_plug_request(vm, diff);
   2381		} else {
   2382			diff = vm->plugged_size - vm->requested_size;
   2383			rc = virtio_mem_unplug_request(vm, diff);
   2384		}
   2385	}
   2386
   2387	switch (rc) {
   2388	case 0:
   2389		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
   2390		break;
   2391	case -ENOSPC:
   2392		/*
   2393		 * We cannot add any more memory (alignment, physical limit)
   2394		 * or we have too many offline memory blocks.
   2395		 */
   2396		break;
   2397	case -ETXTBSY:
   2398		/*
   2399		 * The hypervisor cannot process our request right now
   2400		 * (e.g., out of memory, migrating).
   2401		 */
   2402	case -EBUSY:
   2403		/*
   2404		 * We cannot free up any memory to unplug it (all plugged memory
   2405		 * is busy).
   2406		 */
   2407	case -ENOMEM:
   2408		/* Out of memory, try again later. */
   2409		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
   2410			      HRTIMER_MODE_REL);
   2411		break;
   2412	case -EAGAIN:
   2413		/* Retry immediately (e.g., the config changed). */
   2414		goto retry;
   2415	default:
   2416		/* Unknown error, mark as broken */
   2417		dev_err(&vm->vdev->dev,
   2418			"unknown error, marking device broken: %d\n", rc);
   2419		vm->broken = true;
   2420	}
   2421
   2422	atomic_set(&vm->wq_active, 0);
   2423}
   2424
   2425static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
   2426{
   2427	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
   2428					     retry_timer);
   2429
   2430	virtio_mem_retry(vm);
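       	/* Exponential backoff: double the retry interval, capped at the max. */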
   2431	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
   2432				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
   2433	return HRTIMER_NORESTART;
   2434}
   2435
   2436static void virtio_mem_handle_response(struct virtqueue *vq)
   2437{
   2438	struct virtio_mem *vm = vq->vdev->priv;
   2439
   2440	wake_up(&vm->host_resp);
   2441}
   2442
   2443static int virtio_mem_init_vq(struct virtio_mem *vm)
   2444{
   2445	struct virtqueue *vq;
   2446
   2447	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
   2448				   "guest-request");
   2449	if (IS_ERR(vq))
   2450		return PTR_ERR(vq);
   2451	vm->vq = vq;
   2452
   2453	return 0;
   2454}
   2455
   2456static int virtio_mem_init_hotplug(struct virtio_mem *vm)
   2457{
   2458	const struct range pluggable_range = mhp_get_pluggable_range(true);
   2459	uint64_t unit_pages, sb_size, addr;
   2460	int rc;
   2461
   2462	/* bad device setup - warn only */
   2463	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
   2464		dev_warn(&vm->vdev->dev,
   2465			 "The alignment of the physical start address can make some memory unusable.\n");
   2466	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
   2467		dev_warn(&vm->vdev->dev,
   2468			 "The alignment of the physical end address can make some memory unusable.\n");
   2469	if (vm->addr < pluggable_range.start ||
   2470	    vm->addr + vm->region_size - 1 > pluggable_range.end)
   2471		dev_warn(&vm->vdev->dev,
   2472			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
   2473
   2474	/* Prepare the offline threshold - make sure we can add two blocks. */
   2475	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
   2476				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
   2477
   2478	/*
   2479	 * alloc_contig_range() works reliably with pageblock
   2480	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
   2481	 */
   2482	sb_size = PAGE_SIZE * pageblock_nr_pages;
   2483	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
   2484
   2485	if (sb_size < memory_block_size_bytes() && !force_bbm) {
   2486		/* SBM: At least two subblocks per Linux memory block. */
   2487		vm->in_sbm = true;
   2488		vm->sbm.sb_size = sb_size;
   2489		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
   2490				     vm->sbm.sb_size;
   2491
   2492		/* Round up to the next full memory block */
   2493		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
   2494		       memory_block_size_bytes() - 1;
   2495		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
   2496		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
   2497	} else {
   2498		/* BBM: At least one Linux memory block. */
   2499		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
   2500					memory_block_size_bytes());
   2501
   2502		if (bbm_block_size) {
   2503			if (!is_power_of_2(bbm_block_size)) {
   2504				dev_warn(&vm->vdev->dev,
   2505					 "bbm_block_size is not a power of 2");
   2506			} else if (bbm_block_size < vm->bbm.bb_size) {
   2507				dev_warn(&vm->vdev->dev,
   2508					 "bbm_block_size is too small");
   2509			} else {
   2510				vm->bbm.bb_size = bbm_block_size;
   2511			}
   2512		}
   2513
   2514		/* Round up to the next aligned big block */
   2515		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
   2516		       vm->bbm.bb_size - 1;
   2517		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
   2518		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
   2519
   2520		/* Make sure we can add two big blocks. */
   2521		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
   2522					      vm->offline_threshold);
   2523	}
   2524
   2525	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
   2526		 memory_block_size_bytes());
   2527	if (vm->in_sbm)
   2528		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
   2529			 (unsigned long long)vm->sbm.sb_size);
   2530	else
   2531		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
   2532			 (unsigned long long)vm->bbm.bb_size);
   2533
   2534	/* create the parent resource for all memory */
   2535	rc = virtio_mem_create_resource(vm);
   2536	if (rc)
   2537		return rc;
   2538
   2539	/* use a single dynamic memory group to cover the whole memory device */
   2540	if (vm->in_sbm)
   2541		unit_pages = PHYS_PFN(memory_block_size_bytes());
   2542	else
   2543		unit_pages = PHYS_PFN(vm->bbm.bb_size);
   2544	rc = memory_group_register_dynamic(vm->nid, unit_pages);
   2545	if (rc < 0)
   2546		goto out_del_resource;
   2547	vm->mgid = rc;
   2548
   2549	/*
   2550	 * If we still have memory plugged, we have to unplug all memory first.
   2551	 * Registering our parent resource makes sure that this memory isn't
   2552	 * actually in use (e.g., trying to reload the driver).
   2553	 */
   2554	if (vm->plugged_size) {
   2555		vm->unplug_all_required = true;
   2556		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
   2557	}
   2558
   2559	/* register callbacks */
   2560	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
   2561	rc = register_memory_notifier(&vm->memory_notifier);
   2562	if (rc)
   2563		goto out_unreg_group;
   2564	rc = register_virtio_mem_device(vm);
   2565	if (rc)
   2566		goto out_unreg_mem;
   2567
   2568	return 0;
   2569out_unreg_mem:
   2570	unregister_memory_notifier(&vm->memory_notifier);
   2571out_unreg_group:
   2572	memory_group_unregister(vm->mgid);
   2573out_del_resource:
   2574	virtio_mem_delete_resource(vm);
   2575	return rc;
   2576}
   2577
   2578#ifdef CONFIG_PROC_VMCORE
   2579static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
   2580					 uint64_t size)
   2581{
   2582	const uint64_t nb_vm_blocks = size / vm->device_block_size;
   2583	const struct virtio_mem_req req = {
   2584		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
   2585		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
   2586		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
   2587	};
   2588	int rc = -ENOMEM;
   2589
   2590	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
   2591		addr + size - 1);
   2592
   2593	switch (virtio_mem_send_request(vm, &req)) {
   2594	case VIRTIO_MEM_RESP_ACK:
   2595		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
   2596	case VIRTIO_MEM_RESP_ERROR:
   2597		rc = -EINVAL;
   2598		break;
   2599	default:
   2600		break;
   2601	}
   2602
   2603	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
   2604	return rc;
   2605}
   2606
   2607static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
   2608					 unsigned long pfn)
   2609{
   2610	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
   2611					     vmcore_cb);
   2612	uint64_t addr = PFN_PHYS(pfn);
   2613	bool is_ram;
   2614	int rc;
   2615
   2616	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
   2617		return true;
   2618	if (!vm->plugged_size)
   2619		return false;
   2620
   2621	/*
   2622	 * We have to serialize device requests and access to the information
   2623	 * about the block queried last.
   2624	 */
   2625	mutex_lock(&vm->hotplug_mutex);
   2626
   2627	addr = ALIGN_DOWN(addr, vm->device_block_size);
   2628	if (addr != vm->last_block_addr) {
   2629		rc = virtio_mem_send_state_request(vm, addr,
   2630						   vm->device_block_size);
   2631		/* On any kind of error, we're going to signal !ram. */
   2632		if (rc == VIRTIO_MEM_STATE_PLUGGED)
   2633			vm->last_block_plugged = true;
   2634		else
   2635			vm->last_block_plugged = false;
   2636		vm->last_block_addr = addr;
   2637	}
   2638
   2639	is_ram = vm->last_block_plugged;
   2640	mutex_unlock(&vm->hotplug_mutex);
   2641	return is_ram;
   2642}
   2643#endif /* CONFIG_PROC_VMCORE */
   2644
   2645static int virtio_mem_init_kdump(struct virtio_mem *vm)
   2646{
   2647#ifdef CONFIG_PROC_VMCORE
   2648	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
   2649	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
   2650	register_vmcore_cb(&vm->vmcore_cb);
   2651	return 0;
   2652#else /* CONFIG_PROC_VMCORE */
   2653	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
   2654	return -EBUSY;
   2655#endif /* CONFIG_PROC_VMCORE */
   2656}
   2657
   2658static int virtio_mem_init(struct virtio_mem *vm)
   2659{
   2660	uint16_t node_id;
   2661
   2662	if (!vm->vdev->config->get) {
   2663		dev_err(&vm->vdev->dev, "config access disabled\n");
   2664		return -EINVAL;
   2665	}
   2666
   2667	/* Fetch all properties that can't change. */
   2668	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
   2669			&vm->plugged_size);
   2670	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
   2671			&vm->device_block_size);
   2672	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
   2673			&node_id);
   2674	vm->nid = virtio_mem_translate_node_id(vm, node_id);
   2675	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
   2676	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
   2677			&vm->region_size);
   2678
   2679	/* Determine the nid for the device based on the lowest address. */
   2680	if (vm->nid == NUMA_NO_NODE)
   2681		vm->nid = memory_add_physaddr_to_nid(vm->addr);
   2682
   2683	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
   2684	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
   2685	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
   2686		 (unsigned long long)vm->device_block_size);
   2687	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
   2688		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
   2689
   2690	/*
   2691	 * We don't want to (un)plug or reuse any memory when in kdump. The
   2692	 * memory is still accessible (but not exposed to Linux).
   2693	 */
   2694	if (vm->in_kdump)
   2695		return virtio_mem_init_kdump(vm);
   2696	return virtio_mem_init_hotplug(vm);
   2697}
   2698
   2699static int virtio_mem_create_resource(struct virtio_mem *vm)
   2700{
   2701	/*
   2702	 * When force-unloading the driver and removing the device, we
   2703	 * could have a garbage pointer. Duplicate the string.
   2704	 */
   2705	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
   2706
   2707	if (!name)
   2708		return -ENOMEM;
   2709
   2710	/* Disallow mapping device memory via /dev/mem completely. */
   2711	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
   2712						   name, IORESOURCE_SYSTEM_RAM |
   2713						   IORESOURCE_EXCLUSIVE);
   2714	if (!vm->parent_resource) {
   2715		kfree(name);
   2716		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
   2717		dev_info(&vm->vdev->dev,
   2718			 "reloading the driver is not supported\n");
   2719		return -EBUSY;
   2720	}
   2721
   2722	/* The memory is not actually busy - make add_memory() work. */
   2723	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
   2724	return 0;
   2725}
   2726
   2727static void virtio_mem_delete_resource(struct virtio_mem *vm)
   2728{
   2729	const char *name;
   2730
   2731	if (!vm->parent_resource)
   2732		return;
   2733
   2734	name = vm->parent_resource->name;
   2735	release_resource(vm->parent_resource);
   2736	kfree(vm->parent_resource);
   2737	kfree(name);
   2738	vm->parent_resource = NULL;
   2739}
   2740
   2741static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
   2742{
   2743	return 1;
   2744}
   2745
   2746static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
   2747{
   2748	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
   2749
   2750	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
   2751				   vm->addr + vm->region_size, NULL,
   2752				   virtio_mem_range_has_system_ram) == 1;
   2753}
   2754
   2755static int virtio_mem_probe(struct virtio_device *vdev)
   2756{
   2757	struct virtio_mem *vm;
   2758	int rc;
   2759
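       	/* Sanity-check the request/response layout against the device ABI. */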
   2760	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
   2761	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
   2762
   2763	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
   2764	if (!vm)
   2765		return -ENOMEM;
   2766
   2767	init_waitqueue_head(&vm->host_resp);
   2768	vm->vdev = vdev;
   2769	INIT_WORK(&vm->wq, virtio_mem_run_wq);
   2770	mutex_init(&vm->hotplug_mutex);
   2771	INIT_LIST_HEAD(&vm->next);
   2772	spin_lock_init(&vm->removal_lock);
   2773	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
   2774	vm->retry_timer.function = virtio_mem_timer_expired;
   2775	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
   2776	vm->in_kdump = is_kdump_kernel();
   2777
   2778	/* register the virtqueue */
   2779	rc = virtio_mem_init_vq(vm);
   2780	if (rc)
   2781		goto out_free_vm;
   2782
   2783	/* initialize the device by querying the config */
   2784	rc = virtio_mem_init(vm);
   2785	if (rc)
   2786		goto out_del_vq;
   2787
   2788	virtio_device_ready(vdev);
   2789
   2790	/* trigger a config update to start processing the requested_size */
   2791	if (!vm->in_kdump) {
   2792		atomic_set(&vm->config_changed, 1);
   2793		queue_work(system_freezable_wq, &vm->wq);
   2794	}
   2795
   2796	return 0;
   2797out_del_vq:
   2798	vdev->config->del_vqs(vdev);
   2799out_free_vm:
   2800	kfree(vm);
   2801	vdev->priv = NULL;
   2802
   2803	return rc;
   2804}
   2805
   2806static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
   2807{
   2808	unsigned long mb_id;
   2809	int rc;
   2810
   2811	/*
   2812	 * Make sure the workqueue won't be triggered anymore and no memory
   2813	 * blocks can be onlined/offlined until we're finished here.
   2814	 */
   2815	mutex_lock(&vm->hotplug_mutex);
   2816	spin_lock_irq(&vm->removal_lock);
   2817	vm->removing = true;
   2818	spin_unlock_irq(&vm->removal_lock);
   2819	mutex_unlock(&vm->hotplug_mutex);
   2820
   2821	/* wait until the workqueue has stopped */
   2822	cancel_work_sync(&vm->wq);
   2823	hrtimer_cancel(&vm->retry_timer);
   2824
   2825	if (vm->in_sbm) {
   2826		/*
   2827		 * After we unregistered our callbacks, user space can online
   2828		 * partially plugged offline blocks. Make sure to remove them.
   2829		 */
   2830		virtio_mem_sbm_for_each_mb(vm, mb_id,
   2831					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
   2832			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
   2833			BUG_ON(rc);
   2834			virtio_mem_sbm_set_mb_state(vm, mb_id,
   2835						    VIRTIO_MEM_SBM_MB_UNUSED);
   2836		}
   2837		/*
   2838		 * After we unregistered our callbacks, user space can no longer
   2839		 * offline partially plugged online memory blocks. No need to
   2840		 * worry about them.
   2841		 */
   2842	}
   2843
   2844	/* unregister callbacks */
   2845	unregister_virtio_mem_device(vm);
   2846	unregister_memory_notifier(&vm->memory_notifier);
   2847
   2848	/*
   2849	 * There is no way we could reliably remove all memory we have added to
   2850	 * the system. And there is no way to stop the driver/device from going
   2851	 * away. At least warn.
   2852	 */
   2853	if (virtio_mem_has_memory_added(vm)) {
   2854		dev_warn(&vm->vdev->dev,
   2855			 "device still has system memory added\n");
   2856	} else {
   2857		virtio_mem_delete_resource(vm);
   2858		kfree_const(vm->resource_name);
   2859		memory_group_unregister(vm->mgid);
   2860	}
   2861
   2862	/* remove all tracking data - no locking needed */
   2863	if (vm->in_sbm) {
   2864		vfree(vm->sbm.mb_states);
   2865		vfree(vm->sbm.sb_states);
   2866	} else {
   2867		vfree(vm->bbm.bb_states);
   2868	}
   2869}
   2870
   2871static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
   2872{
   2873#ifdef CONFIG_PROC_VMCORE
   2874	unregister_vmcore_cb(&vm->vmcore_cb);
   2875#endif /* CONFIG_PROC_VMCORE */
   2876}
   2877
   2878static void virtio_mem_remove(struct virtio_device *vdev)
   2879{
   2880	struct virtio_mem *vm = vdev->priv;
   2881
   2882	if (vm->in_kdump)
   2883		virtio_mem_deinit_kdump(vm);
   2884	else
   2885		virtio_mem_deinit_hotplug(vm);
   2886
   2887	/* reset the device and clean up the queues */
   2888	virtio_reset_device(vdev);
   2889	vdev->config->del_vqs(vdev);
   2890
   2891	kfree(vm);
   2892	vdev->priv = NULL;
   2893}
   2894
   2895static void virtio_mem_config_changed(struct virtio_device *vdev)
   2896{
   2897	struct virtio_mem *vm = vdev->priv;
   2898
   2899	if (unlikely(vm->in_kdump))
   2900		return;
   2901
   2902	atomic_set(&vm->config_changed, 1);
   2903	virtio_mem_retry(vm);
   2904}
   2905
   2906#ifdef CONFIG_PM_SLEEP
   2907static int virtio_mem_freeze(struct virtio_device *vdev)
   2908{
   2909	/*
   2910	 * When restarting the VM, all memory is usually unplugged. Don't
   2911	 * allow suspending/hibernating.
   2912	 */
   2913	dev_err(&vdev->dev, "save/restore not supported.\n");
   2914	return -EPERM;
   2915}
   2916
   2917static int virtio_mem_restore(struct virtio_device *vdev)
   2918{
   2919	return -EPERM;
   2920}
   2921#endif
   2922
   2923static unsigned int virtio_mem_features[] = {
   2924#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
   2925	VIRTIO_MEM_F_ACPI_PXM,
   2926#endif
   2927	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
   2928};
   2929
   2930static const struct virtio_device_id virtio_mem_id_table[] = {
   2931	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
   2932	{ 0 },
   2933};
   2934
   2935static struct virtio_driver virtio_mem_driver = {
   2936	.feature_table = virtio_mem_features,
   2937	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
   2938	.driver.name = KBUILD_MODNAME,
   2939	.driver.owner = THIS_MODULE,
   2940	.id_table = virtio_mem_id_table,
   2941	.probe = virtio_mem_probe,
   2942	.remove = virtio_mem_remove,
   2943	.config_changed = virtio_mem_config_changed,
   2944#ifdef CONFIG_PM_SLEEP
   2945	.freeze	=	virtio_mem_freeze,
   2946	.restore =	virtio_mem_restore,
   2947#endif
   2948};
   2949
   2950module_virtio_driver(virtio_mem_driver);
   2951MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
   2952MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
   2953MODULE_DESCRIPTION("Virtio-mem driver");
   2954MODULE_LICENSE("GPL");