cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

hv_balloon.c (50337B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (c) 2012, Microsoft Corporation.
      4 *
      5 * Author:
      6 *   K. Y. Srinivasan <kys@microsoft.com>
      7 */
      8
      9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     10
     11#include <linux/kernel.h>
     12#include <linux/jiffies.h>
     13#include <linux/mman.h>
     14#include <linux/delay.h>
     15#include <linux/init.h>
     16#include <linux/module.h>
     17#include <linux/slab.h>
     18#include <linux/kthread.h>
     19#include <linux/completion.h>
     20#include <linux/count_zeros.h>
     21#include <linux/memory_hotplug.h>
     22#include <linux/memory.h>
     23#include <linux/notifier.h>
     24#include <linux/percpu_counter.h>
     25#include <linux/page_reporting.h>
     26
     27#include <linux/hyperv.h>
     28#include <asm/hyperv-tlfs.h>
     29
     30#include <asm/mshyperv.h>
     31
     32#define CREATE_TRACE_POINTS
     33#include "hv_trace_balloon.h"
     34
     35/*
     36 * We begin with definitions supporting the Dynamic Memory protocol
     37 * with the host.
     38 *
     39 * Begin protocol definitions.
     40 */
     41
     42
     43
     44/*
     45 * Protocol versions. The low word is the minor version, the high word the major
     46 * version.
     47 *
     48 * History:
     49 * Initial version 1.0
     50 * Changed to 0.1 on 2009/03/25
     51 * Changed to 0.2 on 2009/05/14
     52 * Changed to 0.3 on 2009/12/03
     53 * Changed to 1.0 on 2011/04/05
     54 */
     55
     56#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
     57#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
     58#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
     59
     60enum {
     61	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
     62	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
     63	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),
     64
     65	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
     66	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
     67	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,
     68
     69	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
     70};
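/*
 * A quick worked example of the encoding above: DYNMEM_MAKE_VERSION(2, 0)
 * packs to 0x00020000, from which DYNMEM_MAJOR_VERSION() recovers 2 and
 * DYNMEM_MINOR_VERSION() recovers 0; likewise DYNMEM_PROTOCOL_VERSION_WIN7
 * (0.3) packs to 0x00000003.
 */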
     71
     72
     73
     74/*
     75 * Message Types
     76 */
     77
     78enum dm_message_type {
     79	/*
     80	 * Version 0.3
     81	 */
     82	DM_ERROR			= 0,
     83	DM_VERSION_REQUEST		= 1,
     84	DM_VERSION_RESPONSE		= 2,
     85	DM_CAPABILITIES_REPORT		= 3,
     86	DM_CAPABILITIES_RESPONSE	= 4,
     87	DM_STATUS_REPORT		= 5,
     88	DM_BALLOON_REQUEST		= 6,
     89	DM_BALLOON_RESPONSE		= 7,
     90	DM_UNBALLOON_REQUEST		= 8,
     91	DM_UNBALLOON_RESPONSE		= 9,
     92	DM_MEM_HOT_ADD_REQUEST		= 10,
     93	DM_MEM_HOT_ADD_RESPONSE		= 11,
     94	DM_VERSION_03_MAX		= 11,
     95	/*
     96	 * Version 1.0.
     97	 */
     98	DM_INFO_MESSAGE			= 12,
     99	DM_VERSION_1_MAX		= 12
    100};
    101
    102
    103/*
    104 * Structures defining the dynamic memory management
    105 * protocol.
    106 */
    107
    108union dm_version {
    109	struct {
    110		__u16 minor_version;
    111		__u16 major_version;
    112	};
    113	__u32 version;
    114} __packed;
    115
    116
    117union dm_caps {
    118	struct {
    119		__u64 balloon:1;
    120		__u64 hot_add:1;
    121		/*
    122		 * To support guests that may have alignment
    123		 * limitations on hot-add, the guest can specify
    124		 * its alignment requirements; a value of n
     125		 * represents an alignment of 2^n megabytes.
    126		 */
    127		__u64 hot_add_alignment:4;
    128		__u64 reservedz:58;
    129	} cap_bits;
    130	__u64 caps;
    131} __packed;
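/*
 * For illustration: hot_add_alignment is a power-of-two exponent, so
 * advertising a value of 7 means 2^7 = 128 MB alignment, which is what this
 * driver reports in balloon_connect_vsp() below.
 */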
    132
    133union dm_mem_page_range {
    134	struct  {
    135		/*
     136		 * The PFN of the first page in the range.
     137		 * 40 bits is the architectural limit of a PFN
     138		 * for AMD64.
    139		 */
    140		__u64 start_page:40;
    141		/*
    142		 * The number of pages in the range.
    143		 */
    144		__u64 page_cnt:24;
    145	} finfo;
    146	__u64  page_range;
    147} __packed;
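/*
 * Illustrative example (assuming the usual little-endian bitfield layout):
 * start_page occupies the low 40 bits and page_cnt the upper 24 bits of
 * page_range, so a 512-page range starting at PFN 0x100000 packs as
 * ((__u64)512 << 40) | 0x100000.
 */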
    148
    149
    150
    151/*
    152 * The header for all dynamic memory messages:
    153 *
    154 * type: Type of the message.
    155 * size: Size of the message in bytes; including the header.
    156 * trans_id: The guest is responsible for manufacturing this ID.
    157 */
    158
    159struct dm_header {
    160	__u16 type;
    161	__u16 size;
    162	__u32 trans_id;
    163} __packed;
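/*
 * Messages in this file are built the same way; for example, post_status()
 * below fills hdr.type = DM_STATUS_REPORT, hdr.size = sizeof(struct dm_status)
 * and a fresh hdr.trans_id taken from atomic_inc_return(&trans_id).
 */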
    164
    165/*
    166 * A generic message format for dynamic memory.
    167 * Specific message formats are defined later in the file.
    168 */
    169
    170struct dm_message {
    171	struct dm_header hdr;
    172	__u8 data[]; /* enclosed message */
    173} __packed;
    174
    175
    176/*
    177 * Specific message types supporting the dynamic memory protocol.
    178 */
    179
    180/*
    181 * Version negotiation message. Sent from the guest to the host.
    182 * The guest is free to try different versions until the host
    183 * accepts the version.
    184 *
    185 * dm_version: The protocol version requested.
     186 * is_last_attempt: If TRUE, this is the last version the guest will request.
    187 * reservedz: Reserved field, set to zero.
    188 */
    189
    190struct dm_version_request {
    191	struct dm_header hdr;
    192	union dm_version version;
    193	__u32 is_last_attempt:1;
    194	__u32 reservedz:31;
    195} __packed;
    196
    197/*
     198 * Version response message; sent from the host to the guest and indicates
     199 * whether the host has accepted the version sent by the guest.
     200 *
     201 * is_accepted: If TRUE, the host has accepted the version and the guest
     202 * should proceed to the next stage of the protocol. FALSE indicates that
     203 * the guest should retry with a different version.
    204 *
    205 * reservedz: Reserved field, set to zero.
    206 */
    207
    208struct dm_version_response {
    209	struct dm_header hdr;
    210	__u64 is_accepted:1;
    211	__u64 reservedz:63;
    212} __packed;
    213
    214/*
    215 * Message reporting capabilities. This is sent from the guest to the
    216 * host.
    217 */
    218
    219struct dm_capabilities {
    220	struct dm_header hdr;
    221	union dm_caps caps;
    222	__u64 min_page_cnt;
    223	__u64 max_page_number;
    224} __packed;
    225
    226/*
    227 * Response to the capabilities message. This is sent from the host to the
    228 * guest. This message notifies if the host has accepted the guest's
     229 * capabilities. If the host has not accepted, the guest must shut down
    230 * the service.
    231 *
    232 * is_accepted: Indicates if the host has accepted guest's capabilities.
    233 * reservedz: Must be 0.
    234 */
    235
    236struct dm_capabilities_resp_msg {
    237	struct dm_header hdr;
    238	__u64 is_accepted:1;
    239	__u64 reservedz:63;
    240} __packed;
    241
    242/*
    243 * This message is used to report memory pressure from the guest.
    244 * This message is not part of any transaction and there is no
    245 * response to this message.
    246 *
    247 * num_avail: Available memory in pages.
    248 * num_committed: Committed memory in pages.
    249 * page_file_size: The accumulated size of all page files
    250 *		   in the system in pages.
     251 * zero_free: The number of zero and free pages.
    252 * page_file_writes: The writes to the page file in pages.
    253 * io_diff: An indicator of file cache efficiency or page file activity,
    254 *	    calculated as File Cache Page Fault Count - Page Read Count.
    255 *	    This value is in pages.
    256 *
    257 * Some of these metrics are Windows specific and fortunately
    258 * the algorithm on the host side that computes the guest memory
    259 * pressure only uses num_committed value.
    260 */
    261
    262struct dm_status {
    263	struct dm_header hdr;
    264	__u64 num_avail;
    265	__u64 num_committed;
    266	__u64 page_file_size;
    267	__u64 zero_free;
    268	__u32 page_file_writes;
    269	__u32 io_diff;
    270} __packed;
    271
    272
    273/*
    274 * Message to ask the guest to allocate memory - balloon up message.
    275 * This message is sent from the host to the guest. The guest may not be
    276 * able to allocate as much memory as requested.
    277 *
    278 * num_pages: number of pages to allocate.
    279 */
    280
    281struct dm_balloon {
    282	struct dm_header hdr;
    283	__u32 num_pages;
    284	__u32 reservedz;
    285} __packed;
    286
    287
    288/*
    289 * Balloon response message; this message is sent from the guest
    290 * to the host in response to the balloon message.
    291 *
    292 * reservedz: Reserved; must be set to zero.
    293 * more_pages: If FALSE, this is the last message of the transaction.
     294 * If TRUE, there will be at least one more message from the guest.
    295 *
    296 * range_count: The number of ranges in the range array.
    297 *
    298 * range_array: An array of page ranges returned to the host.
    299 *
    300 */
    301
    302struct dm_balloon_response {
    303	struct dm_header hdr;
    304	__u32 reservedz;
    305	__u32 more_pages:1;
    306	__u32 range_count:31;
    307	union dm_mem_page_range range_array[];
    308} __packed;
    309
    310/*
    311 * Un-balloon message; this message is sent from the host
    312 * to the guest to give guest more memory.
    313 *
    314 * more_pages: If FALSE, this is the last message of the transaction.
     315 * If TRUE, there will be at least one more message from the guest.
    316 *
    317 * reservedz: Reserved; must be set to zero.
    318 *
    319 * range_count: The number of ranges in the range array.
    320 *
    321 * range_array: An array of page ranges returned to the host.
    322 *
    323 */
    324
    325struct dm_unballoon_request {
    326	struct dm_header hdr;
    327	__u32 more_pages:1;
    328	__u32 reservedz:31;
    329	__u32 range_count;
    330	union dm_mem_page_range range_array[];
    331} __packed;
    332
    333/*
    334 * Un-balloon response message; this message is sent from the guest
    335 * to the host in response to an unballoon request.
    336 *
    337 */
    338
    339struct dm_unballoon_response {
    340	struct dm_header hdr;
    341} __packed;
    342
    343
    344/*
    345 * Hot add request message. Message sent from the host to the guest.
    346 *
    347 * mem_range: Memory range to hot add.
    348 *
    349 */
    350
    351struct dm_hot_add {
    352	struct dm_header hdr;
    353	union dm_mem_page_range range;
    354} __packed;
    355
    356/*
    357 * Hot add response message.
    358 * This message is sent by the guest to report the status of a hot add request.
    359 * If page_count is less than the requested page count, then the host should
    360 * assume all further hot add requests will fail, since this indicates that
    361 * the guest has hit an upper physical memory barrier.
    362 *
    363 * Hot adds may also fail due to low resources; in this case, the guest must
    364 * not complete this message until the hot add can succeed, and the host must
    365 * not send a new hot add request until the response is sent.
    366 * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
     367 * times, it fails the request.
    368 *
    369 *
    370 * page_count: number of pages that were successfully hot added.
    371 *
    372 * result: result of the operation 1: success, 0: failure.
    373 *
    374 */
    375
    376struct dm_hot_add_response {
    377	struct dm_header hdr;
    378	__u32 page_count;
    379	__u32 result;
    380} __packed;
    381
    382/*
    383 * Types of information sent from host to the guest.
    384 */
    385
    386enum dm_info_type {
    387	INFO_TYPE_MAX_PAGE_CNT = 0,
    388	MAX_INFO_TYPE
    389};
    390
    391
    392/*
    393 * Header for the information message.
    394 */
    395
    396struct dm_info_header {
    397	enum dm_info_type type;
    398	__u32 data_size;
    399} __packed;
    400
    401/*
    402 * This message is sent from the host to the guest to pass
    403 * some relevant information (win8 addition).
    404 *
     405 * reserved: not used.
    406 * info_size: size of the information blob.
    407 * info: information blob.
    408 */
    409
    410struct dm_info_msg {
    411	struct dm_header hdr;
    412	__u32 reserved;
    413	__u32 info_size;
    414	__u8  info[];
    415};
    416
    417/*
    418 * End protocol definitions.
    419 */
    420
    421/*
    422 * State to manage hot adding memory into the guest.
    423 * The range start_pfn : end_pfn specifies the range
    424 * that the host has asked us to hot add. The range
    425 * start_pfn : ha_end_pfn specifies the range that we have
    426 * currently hot added. We hot add in multiples of 128M
    427 * chunks; it is possible that we may not be able to bring
    428 * online all the pages in the region. The range
    429 * covered_start_pfn:covered_end_pfn defines the pages that can
     430 * be brought online.
    431 */
    432
    433struct hv_hotadd_state {
    434	struct list_head list;
    435	unsigned long start_pfn;
    436	unsigned long covered_start_pfn;
    437	unsigned long covered_end_pfn;
    438	unsigned long ha_end_pfn;
    439	unsigned long end_pfn;
    440	/*
    441	 * A list of gaps.
    442	 */
    443	struct list_head gap_list;
    444};
    445
    446struct hv_hotadd_gap {
    447	struct list_head list;
    448	unsigned long start_pfn;
    449	unsigned long end_pfn;
    450};
    451
    452struct balloon_state {
    453	__u32 num_pages;
    454	struct work_struct wrk;
    455};
    456
    457struct hot_add_wrk {
    458	union dm_mem_page_range ha_page_range;
    459	union dm_mem_page_range ha_region_range;
    460	struct work_struct wrk;
    461};
    462
    463static bool allow_hibernation;
    464static bool hot_add = true;
    465static bool do_hot_add;
    466/*
    467 * Delay reporting memory pressure by
    468 * the specified number of seconds.
    469 */
    470static uint pressure_report_delay = 45;
    471
    472/*
    473 * The last time we posted a pressure report to host.
    474 */
    475static unsigned long last_post_time;
    476
    477module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
    478MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
    479
    480module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
    481MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
    482static atomic_t trans_id = ATOMIC_INIT(0);
    483
    484static int dm_ring_size = VMBUS_RING_SIZE(16 * 1024);
    485
    486/*
    487 * Driver specific state.
    488 */
    489
    490enum hv_dm_state {
    491	DM_INITIALIZING = 0,
    492	DM_INITIALIZED,
    493	DM_BALLOON_UP,
    494	DM_BALLOON_DOWN,
    495	DM_HOT_ADD,
    496	DM_INIT_ERROR
    497};
    498
    499
    500static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
    501static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
    502#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
    503#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
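/*
 * Worked out for the usual 4 KiB PAGE_SIZE: PAGES_IN_2M is 512 pages and
 * HA_CHUNK is 32768 PFNs (128 MiB), so the hot-add work below always rounds
 * up to multiples of 32768 pages.
 */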
    504
    505struct hv_dynmem_device {
    506	struct hv_device *dev;
    507	enum hv_dm_state state;
    508	struct completion host_event;
    509	struct completion config_event;
    510
    511	/*
    512	 * Number of pages we have currently ballooned out.
    513	 */
    514	unsigned int num_pages_ballooned;
    515	unsigned int num_pages_onlined;
    516	unsigned int num_pages_added;
    517
    518	/*
    519	 * State to manage the ballooning (up) operation.
    520	 */
    521	struct balloon_state balloon_wrk;
    522
    523	/*
    524	 * State to execute the "hot-add" operation.
    525	 */
    526	struct hot_add_wrk ha_wrk;
    527
    528	/*
    529	 * This state tracks if the host has specified a hot-add
    530	 * region.
    531	 */
    532	bool host_specified_ha_region;
    533
    534	/*
    535	 * State to synchronize hot-add.
    536	 */
    537	struct completion  ol_waitevent;
    538	/*
    539	 * This thread handles hot-add
    540	 * requests from the host as well as notifying
    541	 * the host with regards to memory pressure in
    542	 * the guest.
    543	 */
    544	struct task_struct *thread;
    545
    546	/*
    547	 * Protects ha_region_list, num_pages_onlined counter and individual
    548	 * regions from ha_region_list.
    549	 */
    550	spinlock_t ha_lock;
    551
    552	/*
    553	 * A list of hot-add regions.
    554	 */
    555	struct list_head ha_region_list;
    556
    557	/*
    558	 * We start with the highest version we can support
    559	 * and downgrade based on the host; we save here the
    560	 * next version to try.
    561	 */
    562	__u32 next_version;
    563
    564	/*
    565	 * The negotiated version agreed by host.
    566	 */
    567	__u32 version;
    568
    569	struct page_reporting_dev_info pr_dev_info;
    570};
    571
    572static struct hv_dynmem_device dm_device;
    573
    574static void post_status(struct hv_dynmem_device *dm);
    575
    576#ifdef CONFIG_MEMORY_HOTPLUG
    577static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
    578				     unsigned long pfn)
    579{
    580	struct hv_hotadd_gap *gap;
    581
    582	/* The page is not backed. */
    583	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
    584		return false;
    585
    586	/* Check for gaps. */
    587	list_for_each_entry(gap, &has->gap_list, list) {
    588		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
    589			return false;
    590	}
    591
    592	return true;
    593}
    594
    595static unsigned long hv_page_offline_check(unsigned long start_pfn,
    596					   unsigned long nr_pages)
    597{
    598	unsigned long pfn = start_pfn, count = 0;
    599	struct hv_hotadd_state *has;
    600	bool found;
    601
    602	while (pfn < start_pfn + nr_pages) {
    603		/*
    604		 * Search for HAS which covers the pfn and when we find one
     605		 * count how many consecutive PFNs are covered.
    606		 */
    607		found = false;
    608		list_for_each_entry(has, &dm_device.ha_region_list, list) {
    609			while ((pfn >= has->start_pfn) &&
    610			       (pfn < has->end_pfn) &&
    611			       (pfn < start_pfn + nr_pages)) {
    612				found = true;
    613				if (has_pfn_is_backed(has, pfn))
    614					count++;
    615				pfn++;
    616			}
    617		}
    618
    619		/*
    620		 * This PFN is not in any HAS (e.g. we're offlining a region
    621		 * which was present at boot), no need to account for it. Go
    622		 * to the next one.
    623		 */
    624		if (!found)
    625			pfn++;
    626	}
    627
    628	return count;
    629}
    630
    631static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
    632			      void *v)
    633{
    634	struct memory_notify *mem = (struct memory_notify *)v;
    635	unsigned long flags, pfn_count;
    636
    637	switch (val) {
    638	case MEM_ONLINE:
    639	case MEM_CANCEL_ONLINE:
    640		complete(&dm_device.ol_waitevent);
    641		break;
    642
    643	case MEM_OFFLINE:
    644		spin_lock_irqsave(&dm_device.ha_lock, flags);
    645		pfn_count = hv_page_offline_check(mem->start_pfn,
    646						  mem->nr_pages);
    647		if (pfn_count <= dm_device.num_pages_onlined) {
    648			dm_device.num_pages_onlined -= pfn_count;
    649		} else {
    650			/*
    651			 * We're offlining more pages than we managed to online.
    652			 * This is unexpected. In any case don't let
    653			 * num_pages_onlined wrap around zero.
    654			 */
    655			WARN_ON_ONCE(1);
    656			dm_device.num_pages_onlined = 0;
    657		}
    658		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    659		break;
    660	case MEM_GOING_ONLINE:
    661	case MEM_GOING_OFFLINE:
    662	case MEM_CANCEL_OFFLINE:
    663		break;
    664	}
    665	return NOTIFY_OK;
    666}
    667
    668static struct notifier_block hv_memory_nb = {
    669	.notifier_call = hv_memory_notifier,
    670	.priority = 0
    671};
    672
    673/* Check if the particular page is backed and can be onlined and online it. */
    674static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
    675{
    676	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
    677		if (!PageOffline(pg))
    678			__SetPageOffline(pg);
    679		return;
    680	}
    681	if (PageOffline(pg))
    682		__ClearPageOffline(pg);
    683
    684	/* This frame is currently backed; online the page. */
    685	generic_online_page(pg, 0);
    686
    687	lockdep_assert_held(&dm_device.ha_lock);
    688	dm_device.num_pages_onlined++;
    689}
    690
    691static void hv_bring_pgs_online(struct hv_hotadd_state *has,
    692				unsigned long start_pfn, unsigned long size)
    693{
    694	int i;
    695
    696	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
    697	for (i = 0; i < size; i++)
    698		hv_page_online_one(has, pfn_to_page(start_pfn + i));
    699}
    700
    701static void hv_mem_hot_add(unsigned long start, unsigned long size,
    702				unsigned long pfn_count,
    703				struct hv_hotadd_state *has)
    704{
    705	int ret = 0;
    706	int i, nid;
    707	unsigned long start_pfn;
    708	unsigned long processed_pfn;
    709	unsigned long total_pfn = pfn_count;
    710	unsigned long flags;
    711
    712	for (i = 0; i < (size/HA_CHUNK); i++) {
    713		start_pfn = start + (i * HA_CHUNK);
    714
    715		spin_lock_irqsave(&dm_device.ha_lock, flags);
    716		has->ha_end_pfn +=  HA_CHUNK;
    717
    718		if (total_pfn > HA_CHUNK) {
    719			processed_pfn = HA_CHUNK;
    720			total_pfn -= HA_CHUNK;
    721		} else {
    722			processed_pfn = total_pfn;
    723			total_pfn = 0;
    724		}
    725
    726		has->covered_end_pfn +=  processed_pfn;
    727		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    728
    729		reinit_completion(&dm_device.ol_waitevent);
    730
    731		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
    732		ret = add_memory(nid, PFN_PHYS((start_pfn)),
    733				(HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
    734
    735		if (ret) {
    736			pr_err("hot_add memory failed error is %d\n", ret);
    737			if (ret == -EEXIST) {
    738				/*
     739				 * This error indicates that the failure
     740				 * is not transient. This is the
    741				 * case where the guest's physical address map
    742				 * precludes hot adding memory. Stop all further
    743				 * memory hot-add.
    744				 */
    745				do_hot_add = false;
    746			}
    747			spin_lock_irqsave(&dm_device.ha_lock, flags);
    748			has->ha_end_pfn -= HA_CHUNK;
    749			has->covered_end_pfn -=  processed_pfn;
    750			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    751			break;
    752		}
    753
    754		/*
    755		 * Wait for memory to get onlined. If the kernel onlined the
    756		 * memory when adding it, this will return directly. Otherwise,
    757		 * it will wait for user space to online the memory. This helps
    758		 * to avoid adding memory faster than it is getting onlined. As
    759		 * adding succeeded, it is ok to proceed even if the memory was
    760		 * not onlined in time.
    761		 */
    762		wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
    763		post_status(&dm_device);
    764	}
    765}
    766
    767static void hv_online_page(struct page *pg, unsigned int order)
    768{
    769	struct hv_hotadd_state *has;
    770	unsigned long flags;
    771	unsigned long pfn = page_to_pfn(pg);
    772
    773	spin_lock_irqsave(&dm_device.ha_lock, flags);
    774	list_for_each_entry(has, &dm_device.ha_region_list, list) {
    775		/* The page belongs to a different HAS. */
    776		if ((pfn < has->start_pfn) ||
    777				(pfn + (1UL << order) > has->end_pfn))
    778			continue;
    779
    780		hv_bring_pgs_online(has, pfn, 1UL << order);
    781		break;
    782	}
    783	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    784}
    785
    786static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
    787{
    788	struct hv_hotadd_state *has;
    789	struct hv_hotadd_gap *gap;
    790	unsigned long residual, new_inc;
    791	int ret = 0;
    792	unsigned long flags;
    793
    794	spin_lock_irqsave(&dm_device.ha_lock, flags);
    795	list_for_each_entry(has, &dm_device.ha_region_list, list) {
    796		/*
    797		 * If the pfn range we are dealing with is not in the current
    798		 * "hot add block", move on.
    799		 */
    800		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
    801			continue;
    802
    803		/*
    804		 * If the current start pfn is not where the covered_end
    805		 * is, create a gap and update covered_end_pfn.
    806		 */
    807		if (has->covered_end_pfn != start_pfn) {
    808			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
    809			if (!gap) {
    810				ret = -ENOMEM;
    811				break;
    812			}
    813
    814			INIT_LIST_HEAD(&gap->list);
    815			gap->start_pfn = has->covered_end_pfn;
    816			gap->end_pfn = start_pfn;
    817			list_add_tail(&gap->list, &has->gap_list);
    818
    819			has->covered_end_pfn = start_pfn;
    820		}
    821
    822		/*
     823		 * If the current hot-add request extends beyond
     824		 * our current limit, extend it.
    825		 */
    826		if ((start_pfn + pfn_cnt) > has->end_pfn) {
    827			residual = (start_pfn + pfn_cnt - has->end_pfn);
    828			/*
    829			 * Extend the region by multiples of HA_CHUNK.
    830			 */
    831			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
    832			if (residual % HA_CHUNK)
    833				new_inc += HA_CHUNK;
    834
    835			has->end_pfn += new_inc;
    836		}
    837
    838		ret = 1;
    839		break;
    840	}
    841	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    842
    843	return ret;
    844}
    845
    846static unsigned long handle_pg_range(unsigned long pg_start,
    847					unsigned long pg_count)
    848{
    849	unsigned long start_pfn = pg_start;
    850	unsigned long pfn_cnt = pg_count;
    851	unsigned long size;
    852	struct hv_hotadd_state *has;
    853	unsigned long pgs_ol = 0;
    854	unsigned long old_covered_state;
    855	unsigned long res = 0, flags;
    856
    857	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
    858		pg_start);
    859
    860	spin_lock_irqsave(&dm_device.ha_lock, flags);
    861	list_for_each_entry(has, &dm_device.ha_region_list, list) {
    862		/*
    863		 * If the pfn range we are dealing with is not in the current
    864		 * "hot add block", move on.
    865		 */
    866		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
    867			continue;
    868
    869		old_covered_state = has->covered_end_pfn;
    870
    871		if (start_pfn < has->ha_end_pfn) {
    872			/*
    873			 * This is the case where we are backing pages
    874			 * in an already hot added region. Bring
    875			 * these pages online first.
    876			 */
    877			pgs_ol = has->ha_end_pfn - start_pfn;
    878			if (pgs_ol > pfn_cnt)
    879				pgs_ol = pfn_cnt;
    880
    881			has->covered_end_pfn +=  pgs_ol;
    882			pfn_cnt -= pgs_ol;
    883			/*
    884			 * Check if the corresponding memory block is already
    885			 * online. It is possible to observe struct pages still
    886			 * being uninitialized here so check section instead.
    887			 * In case the section is online we need to bring the
    888			 * rest of pfns (which were not backed previously)
    889			 * online too.
    890			 */
    891			if (start_pfn > has->start_pfn &&
    892			    online_section_nr(pfn_to_section_nr(start_pfn)))
    893				hv_bring_pgs_online(has, start_pfn, pgs_ol);
    894
    895		}
    896
    897		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
    898			/*
    899			 * We have some residual hot add range
    900			 * that needs to be hot added; hot add
     901			 * it now. Hot add a multiple of
     902			 * HA_CHUNK that fully covers the pages
    903			 * we have.
    904			 */
    905			size = (has->end_pfn - has->ha_end_pfn);
    906			if (pfn_cnt <= size) {
    907				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
    908				if (pfn_cnt % HA_CHUNK)
    909					size += HA_CHUNK;
    910			} else {
    911				pfn_cnt = size;
    912			}
    913			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    914			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
    915			spin_lock_irqsave(&dm_device.ha_lock, flags);
    916		}
    917		/*
    918		 * If we managed to online any pages that were given to us,
    919		 * we declare success.
    920		 */
    921		res = has->covered_end_pfn - old_covered_state;
    922		break;
    923	}
    924	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    925
    926	return res;
    927}
    928
    929static unsigned long process_hot_add(unsigned long pg_start,
    930					unsigned long pfn_cnt,
    931					unsigned long rg_start,
    932					unsigned long rg_size)
    933{
    934	struct hv_hotadd_state *ha_region = NULL;
    935	int covered;
    936	unsigned long flags;
    937
    938	if (pfn_cnt == 0)
    939		return 0;
    940
    941	if (!dm_device.host_specified_ha_region) {
    942		covered = pfn_covered(pg_start, pfn_cnt);
    943		if (covered < 0)
    944			return 0;
    945
    946		if (covered)
    947			goto do_pg_range;
    948	}
    949
    950	/*
    951	 * If the host has specified a hot-add range; deal with it first.
    952	 */
    953
    954	if (rg_size != 0) {
    955		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
    956		if (!ha_region)
    957			return 0;
    958
    959		INIT_LIST_HEAD(&ha_region->list);
    960		INIT_LIST_HEAD(&ha_region->gap_list);
    961
    962		ha_region->start_pfn = rg_start;
    963		ha_region->ha_end_pfn = rg_start;
    964		ha_region->covered_start_pfn = pg_start;
    965		ha_region->covered_end_pfn = pg_start;
    966		ha_region->end_pfn = rg_start + rg_size;
    967
    968		spin_lock_irqsave(&dm_device.ha_lock, flags);
    969		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
    970		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
    971	}
    972
    973do_pg_range:
    974	/*
    975	 * Process the page range specified; bringing them
    976	 * online if possible.
    977	 */
    978	return handle_pg_range(pg_start, pfn_cnt);
    979}
    980
    981#endif
    982
    983static void hot_add_req(struct work_struct *dummy)
    984{
    985	struct dm_hot_add_response resp;
    986#ifdef CONFIG_MEMORY_HOTPLUG
    987	unsigned long pg_start, pfn_cnt;
    988	unsigned long rg_start, rg_sz;
    989#endif
    990	struct hv_dynmem_device *dm = &dm_device;
    991
    992	memset(&resp, 0, sizeof(struct dm_hot_add_response));
    993	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
    994	resp.hdr.size = sizeof(struct dm_hot_add_response);
    995
    996#ifdef CONFIG_MEMORY_HOTPLUG
    997	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
    998	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
    999
   1000	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
   1001	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
   1002
   1003	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
   1004		unsigned long region_size;
   1005		unsigned long region_start;
   1006
   1007		/*
   1008		 * The host has not specified the hot-add region.
   1009		 * Based on the hot-add page range being specified,
   1010		 * compute a hot-add region that can cover the pages
   1011		 * that need to be hot-added while ensuring the alignment
   1012		 * and size requirements of Linux as it relates to hot-add.
   1013		 */
   1014		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
   1015		if (pfn_cnt % HA_CHUNK)
   1016			region_size += HA_CHUNK;
   1017
   1018		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
   1019
   1020		rg_start = region_start;
   1021		rg_sz = region_size;
   1022	}
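	/*
	 * Example of the rounding above (assuming 4 KiB pages, HA_CHUNK = 32768):
	 * pg_start = 74565, pfn_cnt = 1000 gives region_start = 65536 and
	 * region_size = 32768, i.e. a 128 MiB region [65536, 98304) that
	 * fully covers the requested pages.
	 */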
   1023
   1024	if (do_hot_add)
   1025		resp.page_count = process_hot_add(pg_start, pfn_cnt,
   1026						rg_start, rg_sz);
   1027
   1028	dm->num_pages_added += resp.page_count;
   1029#endif
   1030	/*
   1031	 * The result field of the response structure has the
   1032	 * following semantics:
   1033	 *
   1034	 * 1. If all or some pages hot-added: Guest should return success.
   1035	 *
   1036	 * 2. If no pages could be hot-added:
   1037	 *
   1038	 * If the guest returns success, then the host
   1039	 * will not attempt any further hot-add operations. This
   1040	 * signifies a permanent failure.
   1041	 *
   1042	 * If the guest returns failure, then this failure will be
   1043	 * treated as a transient failure and the host may retry the
   1044	 * hot-add operation after some delay.
   1045	 */
   1046	if (resp.page_count > 0)
   1047		resp.result = 1;
   1048	else if (!do_hot_add)
   1049		resp.result = 1;
   1050	else
   1051		resp.result = 0;
   1052
   1053	if (!do_hot_add || resp.page_count == 0) {
   1054		if (!allow_hibernation)
   1055			pr_err("Memory hot add failed\n");
   1056		else
   1057			pr_info("Ignore hot-add request!\n");
   1058	}
   1059
   1060	dm->state = DM_INITIALIZED;
   1061	resp.hdr.trans_id = atomic_inc_return(&trans_id);
   1062	vmbus_sendpacket(dm->dev->channel, &resp,
   1063			sizeof(struct dm_hot_add_response),
   1064			(unsigned long)NULL,
   1065			VM_PKT_DATA_INBAND, 0);
   1066}
   1067
   1068static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
   1069{
   1070	struct dm_info_header *info_hdr;
   1071
   1072	info_hdr = (struct dm_info_header *)msg->info;
   1073
   1074	switch (info_hdr->type) {
   1075	case INFO_TYPE_MAX_PAGE_CNT:
   1076		if (info_hdr->data_size == sizeof(__u64)) {
   1077			__u64 *max_page_count = (__u64 *)&info_hdr[1];
   1078
   1079			pr_info("Max. dynamic memory size: %llu MB\n",
   1080				(*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
   1081		}
   1082
   1083		break;
   1084	default:
   1085		pr_warn("Received Unknown type: %d\n", info_hdr->type);
   1086	}
   1087}
   1088
   1089static unsigned long compute_balloon_floor(void)
   1090{
   1091	unsigned long min_pages;
   1092	unsigned long nr_pages = totalram_pages();
   1093#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
    1094	/* Simple continuous piecewise linear function:
   1095	 *  max MiB -> min MiB  gradient
   1096	 *       0         0
   1097	 *      16        16
   1098	 *      32        24
   1099	 *     128        72    (1/2)
   1100	 *     512       168    (1/4)
   1101	 *    2048       360    (1/8)
   1102	 *    8192       744    (1/16)
   1103	 *   32768      1512	(1/32)
   1104	 */
   1105	if (nr_pages < MB2PAGES(128))
   1106		min_pages = MB2PAGES(8) + (nr_pages >> 1);
   1107	else if (nr_pages < MB2PAGES(512))
   1108		min_pages = MB2PAGES(40) + (nr_pages >> 2);
   1109	else if (nr_pages < MB2PAGES(2048))
   1110		min_pages = MB2PAGES(104) + (nr_pages >> 3);
   1111	else if (nr_pages < MB2PAGES(8192))
   1112		min_pages = MB2PAGES(232) + (nr_pages >> 4);
   1113	else
   1114		min_pages = MB2PAGES(488) + (nr_pages >> 5);
   1115#undef MB2PAGES
   1116	return min_pages;
   1117}
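/*
 * Example: on a 1 GiB guest (nr_pages == MB2PAGES(1024)) the third branch
 * applies, giving a floor of MB2PAGES(104) + MB2PAGES(1024) / 8 =
 * MB2PAGES(232), i.e. we refuse to balloon below roughly 232 MiB, consistent
 * with the 1/8 gradient row in the table above.
 */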
   1118
   1119/*
    1120 * Post our memory-pressure status to the host. The host expects
    1121 * the guest to post this status periodically at 1 second
    1122 * intervals.
   1123 *
   1124 * The metrics specified in this protocol are very Windows
   1125 * specific and so we cook up numbers here to convey our memory
   1126 * pressure.
   1127 */
   1128
   1129static void post_status(struct hv_dynmem_device *dm)
   1130{
   1131	struct dm_status status;
   1132	unsigned long now = jiffies;
   1133	unsigned long last_post = last_post_time;
   1134	unsigned long num_pages_avail, num_pages_committed;
   1135
   1136	if (pressure_report_delay > 0) {
   1137		--pressure_report_delay;
   1138		return;
   1139	}
   1140
   1141	if (!time_after(now, (last_post_time + HZ)))
   1142		return;
   1143
   1144	memset(&status, 0, sizeof(struct dm_status));
   1145	status.hdr.type = DM_STATUS_REPORT;
   1146	status.hdr.size = sizeof(struct dm_status);
   1147	status.hdr.trans_id = atomic_inc_return(&trans_id);
   1148
   1149	/*
   1150	 * The host expects the guest to report free and committed memory.
   1151	 * Furthermore, the host expects the pressure information to include
   1152	 * the ballooned out pages. For a given amount of memory that we are
   1153	 * managing we need to compute a floor below which we should not
   1154	 * balloon. Compute this and add it to the pressure report.
   1155	 * We also need to report all offline pages (num_pages_added -
   1156	 * num_pages_onlined) as committed to the host, otherwise it can try
   1157	 * asking us to balloon them out.
   1158	 */
   1159	num_pages_avail = si_mem_available();
   1160	num_pages_committed = vm_memory_committed() +
   1161		dm->num_pages_ballooned +
   1162		(dm->num_pages_added > dm->num_pages_onlined ?
   1163		 dm->num_pages_added - dm->num_pages_onlined : 0) +
   1164		compute_balloon_floor();
   1165
   1166	trace_balloon_status(num_pages_avail, num_pages_committed,
   1167			     vm_memory_committed(), dm->num_pages_ballooned,
   1168			     dm->num_pages_added, dm->num_pages_onlined);
   1169
   1170	/* Convert numbers of pages into numbers of HV_HYP_PAGEs. */
   1171	status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE;
   1172	status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE;
   1173
   1174	/*
   1175	 * If our transaction ID is no longer current, just don't
   1176	 * send the status. This can happen if we were interrupted
   1177	 * after we picked our transaction ID.
   1178	 */
   1179	if (status.hdr.trans_id != atomic_read(&trans_id))
   1180		return;
   1181
   1182	/*
   1183	 * If the last post time that we sampled has changed,
   1184	 * we have raced, don't post the status.
   1185	 */
   1186	if (last_post != last_post_time)
   1187		return;
   1188
   1189	last_post_time = jiffies;
   1190	vmbus_sendpacket(dm->dev->channel, &status,
   1191				sizeof(struct dm_status),
   1192				(unsigned long)NULL,
   1193				VM_PKT_DATA_INBAND, 0);
   1194
   1195}
   1196
   1197static void free_balloon_pages(struct hv_dynmem_device *dm,
   1198			 union dm_mem_page_range *range_array)
   1199{
   1200	int num_pages = range_array->finfo.page_cnt;
   1201	__u64 start_frame = range_array->finfo.start_page;
   1202	struct page *pg;
   1203	int i;
   1204
   1205	for (i = 0; i < num_pages; i++) {
   1206		pg = pfn_to_page(i + start_frame);
   1207		__ClearPageOffline(pg);
   1208		__free_page(pg);
   1209		dm->num_pages_ballooned--;
   1210		adjust_managed_page_count(pg, 1);
   1211	}
   1212}
   1213
   1214
   1215
   1216static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
   1217					unsigned int num_pages,
   1218					struct dm_balloon_response *bl_resp,
   1219					int alloc_unit)
   1220{
   1221	unsigned int i, j;
   1222	struct page *pg;
   1223
   1224	for (i = 0; i < num_pages / alloc_unit; i++) {
   1225		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
   1226			HV_HYP_PAGE_SIZE)
   1227			return i * alloc_unit;
   1228
   1229		/*
   1230		 * We execute this code in a thread context. Furthermore,
   1231		 * we don't want the kernel to try too hard.
   1232		 */
   1233		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
   1234				__GFP_NOMEMALLOC | __GFP_NOWARN,
   1235				get_order(alloc_unit << PAGE_SHIFT));
   1236
   1237		if (!pg)
   1238			return i * alloc_unit;
   1239
   1240		dm->num_pages_ballooned += alloc_unit;
   1241
   1242		/*
    1243		 * If we allocated 2M pages, split them so we
   1244		 * can free them in any order we get.
   1245		 */
   1246
   1247		if (alloc_unit != 1)
   1248			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
   1249
   1250		/* mark all pages offline */
   1251		for (j = 0; j < alloc_unit; j++) {
   1252			__SetPageOffline(pg + j);
   1253			adjust_managed_page_count(pg + j, -1);
   1254		}
   1255
   1256		bl_resp->range_count++;
   1257		bl_resp->range_array[i].finfo.start_page =
   1258			page_to_pfn(pg);
   1259		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
   1260		bl_resp->hdr.size += sizeof(union dm_mem_page_range);
   1261
   1262	}
   1263
   1264	return i * alloc_unit;
   1265}
   1266
   1267static void balloon_up(struct work_struct *dummy)
   1268{
   1269	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
   1270	unsigned int num_ballooned = 0;
   1271	struct dm_balloon_response *bl_resp;
   1272	int alloc_unit;
   1273	int ret;
   1274	bool done = false;
   1275	int i;
   1276	long avail_pages;
   1277	unsigned long floor;
   1278
   1279	/*
   1280	 * We will attempt 2M allocations. However, if we fail to
   1281	 * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
   1282	 */
   1283	alloc_unit = PAGES_IN_2M;
   1284
   1285	avail_pages = si_mem_available();
   1286	floor = compute_balloon_floor();
   1287
   1288	/* Refuse to balloon below the floor. */
   1289	if (avail_pages < num_pages || avail_pages - num_pages < floor) {
   1290		pr_info("Balloon request will be partially fulfilled. %s\n",
   1291			avail_pages < num_pages ? "Not enough memory." :
   1292			"Balloon floor reached.");
   1293
   1294		num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
   1295	}
   1296
   1297	while (!done) {
   1298		memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
   1299		bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
   1300		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
   1301		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
   1302		bl_resp->more_pages = 1;
   1303
   1304		num_pages -= num_ballooned;
   1305		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
   1306						    bl_resp, alloc_unit);
   1307
   1308		if (alloc_unit != 1 && num_ballooned == 0) {
   1309			alloc_unit = 1;
   1310			continue;
   1311		}
   1312
   1313		if (num_ballooned == 0 || num_ballooned == num_pages) {
   1314			pr_debug("Ballooned %u out of %u requested pages.\n",
   1315				num_pages, dm_device.balloon_wrk.num_pages);
   1316
   1317			bl_resp->more_pages = 0;
   1318			done = true;
   1319			dm_device.state = DM_INITIALIZED;
   1320		}
   1321
   1322		/*
   1323		 * We are pushing a lot of data through the channel;
    1324		 * deal with transient failures caused by the
   1325		 * lack of space in the ring buffer.
   1326		 */
   1327
   1328		do {
   1329			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
   1330			ret = vmbus_sendpacket(dm_device.dev->channel,
   1331						bl_resp,
   1332						bl_resp->hdr.size,
   1333						(unsigned long)NULL,
   1334						VM_PKT_DATA_INBAND, 0);
   1335
   1336			if (ret == -EAGAIN)
   1337				msleep(20);
   1338			post_status(&dm_device);
   1339		} while (ret == -EAGAIN);
   1340
   1341		if (ret) {
   1342			/*
    1343			 * Free up the memory we allocated.
   1344			 */
   1345			pr_err("Balloon response failed\n");
   1346
   1347			for (i = 0; i < bl_resp->range_count; i++)
   1348				free_balloon_pages(&dm_device,
   1349						 &bl_resp->range_array[i]);
   1350
   1351			done = true;
   1352		}
   1353	}
   1354
   1355}
   1356
   1357static void balloon_down(struct hv_dynmem_device *dm,
   1358			struct dm_unballoon_request *req)
   1359{
   1360	union dm_mem_page_range *range_array = req->range_array;
   1361	int range_count = req->range_count;
   1362	struct dm_unballoon_response resp;
   1363	int i;
   1364	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
   1365
   1366	for (i = 0; i < range_count; i++) {
   1367		free_balloon_pages(dm, &range_array[i]);
   1368		complete(&dm_device.config_event);
   1369	}
   1370
   1371	pr_debug("Freed %u ballooned pages.\n",
   1372		prev_pages_ballooned - dm->num_pages_ballooned);
   1373
   1374	if (req->more_pages == 1)
   1375		return;
   1376
   1377	memset(&resp, 0, sizeof(struct dm_unballoon_response));
   1378	resp.hdr.type = DM_UNBALLOON_RESPONSE;
   1379	resp.hdr.trans_id = atomic_inc_return(&trans_id);
   1380	resp.hdr.size = sizeof(struct dm_unballoon_response);
   1381
   1382	vmbus_sendpacket(dm_device.dev->channel, &resp,
   1383				sizeof(struct dm_unballoon_response),
   1384				(unsigned long)NULL,
   1385				VM_PKT_DATA_INBAND, 0);
   1386
   1387	dm->state = DM_INITIALIZED;
   1388}
   1389
   1390static void balloon_onchannelcallback(void *context);
   1391
   1392static int dm_thread_func(void *dm_dev)
   1393{
   1394	struct hv_dynmem_device *dm = dm_dev;
   1395
   1396	while (!kthread_should_stop()) {
   1397		wait_for_completion_interruptible_timeout(
   1398						&dm_device.config_event, 1*HZ);
   1399		/*
   1400		 * The host expects us to post information on the memory
   1401		 * pressure every second.
   1402		 */
   1403		reinit_completion(&dm_device.config_event);
   1404		post_status(dm);
   1405	}
   1406
   1407	return 0;
   1408}
   1409
   1410
   1411static void version_resp(struct hv_dynmem_device *dm,
   1412			struct dm_version_response *vresp)
   1413{
   1414	struct dm_version_request version_req;
   1415	int ret;
   1416
   1417	if (vresp->is_accepted) {
   1418		/*
    1419		 * We are done; wake up the
   1420		 * context waiting for version
   1421		 * negotiation.
   1422		 */
   1423		complete(&dm->host_event);
   1424		return;
   1425	}
   1426	/*
   1427	 * If there are more versions to try, continue
    1428	 * with negotiations; if not,
    1429	 * shut down the service since we are not able
   1430	 * to negotiate a suitable version number
   1431	 * with the host.
   1432	 */
   1433	if (dm->next_version == 0)
   1434		goto version_error;
   1435
   1436	memset(&version_req, 0, sizeof(struct dm_version_request));
   1437	version_req.hdr.type = DM_VERSION_REQUEST;
   1438	version_req.hdr.size = sizeof(struct dm_version_request);
   1439	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
   1440	version_req.version.version = dm->next_version;
   1441	dm->version = version_req.version.version;
   1442
   1443	/*
   1444	 * Set the next version to try in case current version fails.
   1445	 * Win7 protocol ought to be the last one to try.
   1446	 */
   1447	switch (version_req.version.version) {
   1448	case DYNMEM_PROTOCOL_VERSION_WIN8:
   1449		dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
   1450		version_req.is_last_attempt = 0;
   1451		break;
   1452	default:
   1453		dm->next_version = 0;
   1454		version_req.is_last_attempt = 1;
   1455	}
   1456
   1457	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
   1458				sizeof(struct dm_version_request),
   1459				(unsigned long)NULL,
   1460				VM_PKT_DATA_INBAND, 0);
   1461
   1462	if (ret)
   1463		goto version_error;
   1464
   1465	return;
   1466
   1467version_error:
   1468	dm->state = DM_INIT_ERROR;
   1469	complete(&dm->host_event);
   1470}
   1471
   1472static void cap_resp(struct hv_dynmem_device *dm,
   1473			struct dm_capabilities_resp_msg *cap_resp)
   1474{
   1475	if (!cap_resp->is_accepted) {
   1476		pr_err("Capabilities not accepted by host\n");
   1477		dm->state = DM_INIT_ERROR;
   1478	}
   1479	complete(&dm->host_event);
   1480}
   1481
   1482static void balloon_onchannelcallback(void *context)
   1483{
   1484	struct hv_device *dev = context;
   1485	u32 recvlen;
   1486	u64 requestid;
   1487	struct dm_message *dm_msg;
   1488	struct dm_header *dm_hdr;
   1489	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
   1490	struct dm_balloon *bal_msg;
   1491	struct dm_hot_add *ha_msg;
   1492	union dm_mem_page_range *ha_pg_range;
   1493	union dm_mem_page_range *ha_region;
   1494
   1495	memset(recv_buffer, 0, sizeof(recv_buffer));
   1496	vmbus_recvpacket(dev->channel, recv_buffer,
   1497			 HV_HYP_PAGE_SIZE, &recvlen, &requestid);
   1498
   1499	if (recvlen > 0) {
   1500		dm_msg = (struct dm_message *)recv_buffer;
   1501		dm_hdr = &dm_msg->hdr;
   1502
   1503		switch (dm_hdr->type) {
   1504		case DM_VERSION_RESPONSE:
   1505			version_resp(dm,
   1506				 (struct dm_version_response *)dm_msg);
   1507			break;
   1508
   1509		case DM_CAPABILITIES_RESPONSE:
   1510			cap_resp(dm,
   1511				 (struct dm_capabilities_resp_msg *)dm_msg);
   1512			break;
   1513
   1514		case DM_BALLOON_REQUEST:
   1515			if (allow_hibernation) {
   1516				pr_info("Ignore balloon-up request!\n");
   1517				break;
   1518			}
   1519
   1520			if (dm->state == DM_BALLOON_UP)
   1521				pr_warn("Currently ballooning\n");
   1522			bal_msg = (struct dm_balloon *)recv_buffer;
   1523			dm->state = DM_BALLOON_UP;
   1524			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
   1525			schedule_work(&dm_device.balloon_wrk.wrk);
   1526			break;
   1527
   1528		case DM_UNBALLOON_REQUEST:
   1529			if (allow_hibernation) {
   1530				pr_info("Ignore balloon-down request!\n");
   1531				break;
   1532			}
   1533
   1534			dm->state = DM_BALLOON_DOWN;
   1535			balloon_down(dm,
   1536				 (struct dm_unballoon_request *)recv_buffer);
   1537			break;
   1538
   1539		case DM_MEM_HOT_ADD_REQUEST:
   1540			if (dm->state == DM_HOT_ADD)
   1541				pr_warn("Currently hot-adding\n");
   1542			dm->state = DM_HOT_ADD;
   1543			ha_msg = (struct dm_hot_add *)recv_buffer;
   1544			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
   1545				/*
   1546				 * This is a normal hot-add request specifying
   1547				 * hot-add memory.
   1548				 */
   1549				dm->host_specified_ha_region = false;
   1550				ha_pg_range = &ha_msg->range;
   1551				dm->ha_wrk.ha_page_range = *ha_pg_range;
   1552				dm->ha_wrk.ha_region_range.page_range = 0;
   1553			} else {
   1554				/*
   1555				 * Host is specifying that we first hot-add
   1556				 * a region and then partially populate this
   1557				 * region.
   1558				 */
   1559				dm->host_specified_ha_region = true;
   1560				ha_pg_range = &ha_msg->range;
   1561				ha_region = &ha_pg_range[1];
   1562				dm->ha_wrk.ha_page_range = *ha_pg_range;
   1563				dm->ha_wrk.ha_region_range = *ha_region;
   1564			}
   1565			schedule_work(&dm_device.ha_wrk.wrk);
   1566			break;
   1567
   1568		case DM_INFO_MESSAGE:
   1569			process_info(dm, (struct dm_info_msg *)dm_msg);
   1570			break;
   1571
   1572		default:
   1573			pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
   1574
   1575		}
   1576	}
   1577
   1578}
   1579
   1580/* Hyper-V only supports reporting 2MB pages or higher */
   1581#define HV_MIN_PAGE_REPORTING_ORDER	9
   1582#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
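/*
 * With 4 KiB hypervisor pages, order 9 above works out to 512 * 4 KiB = 2 MiB:
 * only free chunks of at least 2 MiB are reported, and e.g. a 4 MiB
 * scatterlist entry is encoded below as one 2 MiB base page plus
 * additional_pages = 1.
 */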
   1583static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
   1584		    struct scatterlist *sgl, unsigned int nents)
   1585{
   1586	unsigned long flags;
   1587	struct hv_memory_hint *hint;
   1588	int i;
   1589	u64 status;
   1590	struct scatterlist *sg;
   1591
   1592	WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
   1593	WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
   1594	local_irq_save(flags);
   1595	hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
   1596	if (!hint) {
   1597		local_irq_restore(flags);
   1598		return -ENOSPC;
   1599	}
   1600
   1601	hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
   1602	hint->reserved = 0;
   1603	for_each_sg(sgl, sg, nents, i) {
   1604		union hv_gpa_page_range *range;
   1605
   1606		range = &hint->ranges[i];
   1607		range->address_space = 0;
   1608		/* page reporting only reports 2MB pages or higher */
   1609		range->page.largepage = 1;
   1610		range->page.additional_pages =
   1611			(sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
   1612		range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
   1613		range->base_large_pfn =
   1614			page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
   1615	}
   1616
   1617	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
   1618				     hint, NULL);
   1619	local_irq_restore(flags);
   1620	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
   1621		pr_err("Cold memory discard hypercall failed with status %llx\n",
   1622			status);
   1623		return -EINVAL;
   1624	}
   1625
   1626	return 0;
   1627}
   1628
   1629static void enable_page_reporting(void)
   1630{
   1631	int ret;
   1632
    1633	/* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' (i.e. pageblock_order) is big enough. */
   1634	if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
   1635		pr_debug("Cold memory discard is only supported on 2MB pages and above\n");
   1636		return;
   1637	}
   1638
   1639	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
   1640		pr_debug("Cold memory discard hint not supported by Hyper-V\n");
   1641		return;
   1642	}
   1643
   1644	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
   1645	dm_device.pr_dev_info.report = hv_free_page_report;
   1646	ret = page_reporting_register(&dm_device.pr_dev_info);
   1647	if (ret < 0) {
   1648		dm_device.pr_dev_info.report = NULL;
   1649		pr_err("Failed to enable cold memory discard: %d\n", ret);
   1650	} else {
   1651		pr_info("Cold memory discard hint enabled\n");
   1652	}
   1653}
   1654
   1655static void disable_page_reporting(void)
   1656{
   1657	if (dm_device.pr_dev_info.report) {
   1658		page_reporting_unregister(&dm_device.pr_dev_info);
   1659		dm_device.pr_dev_info.report = NULL;
   1660	}
   1661}
   1662
   1663static int ballooning_enabled(void)
   1664{
   1665	/*
   1666	 * Disable ballooning if the page size is not 4k (HV_HYP_PAGE_SIZE),
   1667	 * since currently it's unclear to us whether an unballoon request can
   1668	 * make sure all page ranges are guest page size aligned.
   1669	 */
   1670	if (PAGE_SIZE != HV_HYP_PAGE_SIZE) {
   1671		pr_info("Ballooning disabled because page size is not 4096 bytes\n");
   1672		return 0;
   1673	}
   1674
   1675	return 1;
   1676}
   1677
   1678static int hot_add_enabled(void)
   1679{
   1680	/*
   1681	 * Disable hot add on ARM64, because we currently rely on
   1682	 * memory_add_physaddr_to_nid() to get a node id of a hot add range,
    1683 * however ARM64's memory_add_physaddr_to_nid() always returns 0 and
   1684	 * DM_MEM_HOT_ADD_REQUEST doesn't have the NUMA node information for
   1685	 * add_memory().
   1686	 */
   1687	if (IS_ENABLED(CONFIG_ARM64)) {
   1688		pr_info("Memory hot add disabled on ARM64\n");
   1689		return 0;
   1690	}
   1691
   1692	return 1;
   1693}
   1694
   1695static int balloon_connect_vsp(struct hv_device *dev)
   1696{
   1697	struct dm_version_request version_req;
   1698	struct dm_capabilities cap_msg;
   1699	unsigned long t;
   1700	int ret;
   1701
   1702	/*
   1703	 * max_pkt_size should be large enough for one vmbus packet header plus
   1704	 * our receive buffer size. Hyper-V sends messages up to
    1705 * HV_HYP_PAGE_SIZE bytes long on the balloon channel.
   1706	 */
   1707	dev->channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
   1708
   1709	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
   1710			 balloon_onchannelcallback, dev);
   1711	if (ret)
   1712		return ret;
   1713
   1714	/*
    1715	 * Initiate the handshake with the host and negotiate
   1716	 * a version that the host can support. We start with the
   1717	 * highest version number and go down if the host cannot
   1718	 * support it.
   1719	 */
   1720	memset(&version_req, 0, sizeof(struct dm_version_request));
   1721	version_req.hdr.type = DM_VERSION_REQUEST;
   1722	version_req.hdr.size = sizeof(struct dm_version_request);
   1723	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
   1724	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
   1725	version_req.is_last_attempt = 0;
   1726	dm_device.version = version_req.version.version;
   1727
   1728	ret = vmbus_sendpacket(dev->channel, &version_req,
   1729			       sizeof(struct dm_version_request),
   1730			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
   1731	if (ret)
   1732		goto out;
   1733
   1734	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
   1735	if (t == 0) {
   1736		ret = -ETIMEDOUT;
   1737		goto out;
   1738	}
   1739
   1740	/*
   1741	 * If we could not negotiate a compatible version with the host
   1742	 * fail the probe function.
   1743	 */
   1744	if (dm_device.state == DM_INIT_ERROR) {
   1745		ret = -EPROTO;
   1746		goto out;
   1747	}
   1748
   1749	pr_info("Using Dynamic Memory protocol version %u.%u\n",
   1750		DYNMEM_MAJOR_VERSION(dm_device.version),
   1751		DYNMEM_MINOR_VERSION(dm_device.version));
   1752
   1753	/*
   1754	 * Now submit our capabilities to the host.
   1755	 */
   1756	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
   1757	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
   1758	cap_msg.hdr.size = sizeof(struct dm_capabilities);
   1759	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
   1760
   1761	/*
   1762	 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
   1763	 * currently still requires these capability bits to be set, so we have
   1764	 * to fail the host's hot-add and balloon up/down requests, if any.
   1765	 */
   1766	cap_msg.caps.cap_bits.balloon = ballooning_enabled();
   1767	cap_msg.caps.cap_bits.hot_add = hot_add_enabled();
   1768
   1769	/*
   1770	 * Specify our alignment requirements for memory hot-add:
   1771	 * 128 MB alignment.
   1772	 */
   1773	cap_msg.caps.cap_bits.hot_add_alignment = 7;
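       	/*
       	 * The alignment is encoded as a power of two in megabytes:
       	 * 2^7 MB = 128 MB.
       	 */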
   1774
   1775	/*
   1776	 * Currently the host does not use these
   1777	 * values; we set them to match what the
   1778	 * Windows driver does.
   1779	 */
   1780	cap_msg.min_page_cnt = 0;
   1781	cap_msg.max_page_number = -1;
   1782
   1783	ret = vmbus_sendpacket(dev->channel, &cap_msg,
   1784			       sizeof(struct dm_capabilities),
   1785			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
   1786	if (ret)
   1787		goto out;
   1788
   1789	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
   1790	if (t == 0) {
   1791		ret = -ETIMEDOUT;
   1792		goto out;
   1793	}
   1794
   1795	/*
   1796	 * If the host does not like our capabilities,
   1797	 * fail the probe function.
   1798	 */
   1799	if (dm_device.state == DM_INIT_ERROR) {
   1800		ret = -EPROTO;
   1801		goto out;
   1802	}
   1803
   1804	return 0;
   1805out:
   1806	vmbus_close(dev->channel);
   1807	return ret;
   1808}
   1809
   1810static int balloon_probe(struct hv_device *dev,
   1811			 const struct hv_vmbus_device_id *dev_id)
   1812{
   1813	int ret;
   1814
   1815	allow_hibernation = hv_is_hibernation_supported();
   1816	if (allow_hibernation)
   1817		hot_add = false;
   1818
   1819#ifdef CONFIG_MEMORY_HOTPLUG
   1820	do_hot_add = hot_add;
   1821#else
   1822	do_hot_add = false;
   1823#endif
   1824	dm_device.dev = dev;
   1825	dm_device.state = DM_INITIALIZING;
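       	/*
       	 * next_version is the fallback version offered if the host
       	 * rejects the initial DYNMEM_PROTOCOL_VERSION_WIN10 request.
       	 */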
   1826	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
   1827	init_completion(&dm_device.host_event);
   1828	init_completion(&dm_device.config_event);
   1829	INIT_LIST_HEAD(&dm_device.ha_region_list);
   1830	spin_lock_init(&dm_device.ha_lock);
   1831	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
   1832	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
   1833	dm_device.host_specified_ha_region = false;
   1834
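       	/*
       	 * Register the online-page callback and the memory notifier
       	 * before connecting to the VSP: once the channel is up the host
       	 * may start sending hot-add requests that rely on them.
       	 */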
   1835#ifdef CONFIG_MEMORY_HOTPLUG
   1836	set_online_page_callback(&hv_online_page);
   1837	init_completion(&dm_device.ol_waitevent);
   1838	register_memory_notifier(&hv_memory_nb);
   1839#endif
   1840
   1841	hv_set_drvdata(dev, &dm_device);
   1842
   1843	ret = balloon_connect_vsp(dev);
   1844	if (ret != 0)
   1845		goto connect_error;
   1846
   1847	enable_page_reporting();
   1848	dm_device.state = DM_INITIALIZED;
   1849
   1850	dm_device.thread =
   1851		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
   1852	if (IS_ERR(dm_device.thread)) {
   1853		ret = PTR_ERR(dm_device.thread);
   1854		goto probe_error;
   1855	}
   1856
   1857	return 0;
   1858
   1859probe_error:
   1860	dm_device.state = DM_INIT_ERROR;
   1861	dm_device.thread = NULL;
   1862	disable_page_reporting();
   1863	vmbus_close(dev->channel);
   1864connect_error:
   1865#ifdef CONFIG_MEMORY_HOTPLUG
   1866	unregister_memory_notifier(&hv_memory_nb);
   1867	restore_online_page_callback(&hv_online_page);
   1868#endif
   1869	return ret;
   1870}
   1871
   1872static int balloon_remove(struct hv_device *dev)
   1873{
   1874	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
   1875	struct hv_hotadd_state *has, *tmp;
   1876	struct hv_hotadd_gap *gap, *tmp_gap;
   1877	unsigned long flags;
   1878
   1879	if (dm->num_pages_ballooned != 0)
   1880		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
   1881
   1882	cancel_work_sync(&dm->balloon_wrk.wrk);
   1883	cancel_work_sync(&dm->ha_wrk.wrk);
   1884
   1885	kthread_stop(dm->thread);
   1886
   1887	/*
   1888	 * This handles the case where a balloon_resume()
   1889	 * call has failed and some cleanup has already been
   1890	 * done as part of its error handling.
   1891	 */
   1892	if (dm_device.state != DM_INIT_ERROR) {
   1893		disable_page_reporting();
   1894		vmbus_close(dev->channel);
   1895#ifdef CONFIG_MEMORY_HOTPLUG
   1896		unregister_memory_notifier(&hv_memory_nb);
   1897		restore_online_page_callback(&hv_online_page);
   1898#endif
   1899	}
   1900
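       	/*
       	 * Free the bookkeeping for any remaining hot-add regions and
       	 * their gap entries; the hot-added memory itself stays online.
       	 */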
   1901	spin_lock_irqsave(&dm_device.ha_lock, flags);
   1902	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
   1903		list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
   1904			list_del(&gap->list);
   1905			kfree(gap);
   1906		}
   1907		list_del(&has->list);
   1908		kfree(has);
   1909	}
   1910	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
   1911
   1912	return 0;
   1913}
   1914
   1915static int balloon_suspend(struct hv_device *hv_dev)
   1916{
   1917	struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev);
   1918
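       	/*
       	 * Keep the channel callback tasklet from running while the work
       	 * items and the worker thread are torn down; it is re-enabled
       	 * again below once teardown is complete.
       	 */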
   1919	tasklet_disable(&hv_dev->channel->callback_event);
   1920
   1921	cancel_work_sync(&dm->balloon_wrk.wrk);
   1922	cancel_work_sync(&dm->ha_wrk.wrk);
   1923
   1924	if (dm->thread) {
   1925		kthread_stop(dm->thread);
   1926		dm->thread = NULL;
   1927		vmbus_close(hv_dev->channel);
   1928	}
   1929
   1930	tasklet_enable(&hv_dev->channel->callback_event);
   1931
   1932	return 0;
   1933
   1934}
   1935
   1936static int balloon_resume(struct hv_device *dev)
   1937{
   1938	int ret;
   1939
   1940	dm_device.state = DM_INITIALIZING;
   1941
   1942	ret = balloon_connect_vsp(dev);
   1943
   1944	if (ret != 0)
   1945		goto out;
   1946
   1947	dm_device.thread =
   1948		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
   1949	if (IS_ERR(dm_device.thread)) {
   1950		ret = PTR_ERR(dm_device.thread);
   1951		dm_device.thread = NULL;
   1952		goto close_channel;
   1953	}
   1954
   1955	dm_device.state = DM_INITIALIZED;
   1956	return 0;
   1957close_channel:
   1958	vmbus_close(dev->channel);
   1959out:
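       	/*
       	 * Record DM_INIT_ERROR so that a later balloon_remove() knows
       	 * the channel is already closed and the cleanup below has
       	 * already been done.
       	 */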
   1960	dm_device.state = DM_INIT_ERROR;
   1961	disable_page_reporting();
   1962#ifdef CONFIG_MEMORY_HOTPLUG
   1963	unregister_memory_notifier(&hv_memory_nb);
   1964	restore_online_page_callback(&hv_online_page);
   1965#endif
   1966	return ret;
   1967}
   1968
   1969static const struct hv_vmbus_device_id id_table[] = {
   1970	/* Dynamic Memory Class ID */
   1971	/* 525074DC-8985-46e2-8057-A307DC18A502 */
   1972	{ HV_DM_GUID, },
   1973	{ },
   1974};
   1975
   1976MODULE_DEVICE_TABLE(vmbus, id_table);
   1977
   1978static struct hv_driver balloon_drv = {
   1979	.name = "hv_balloon",
   1980	.id_table = id_table,
   1981	.probe = balloon_probe,
   1982	.remove = balloon_remove,
   1983	.suspend = balloon_suspend,
   1984	.resume = balloon_resume,
   1985	.driver = {
   1986		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
   1987	},
   1988};
   1989
   1990static int __init init_balloon_drv(void)
   1991{
   1992
   1993	return vmbus_driver_register(&balloon_drv);
   1994}
   1995
   1996module_init(init_balloon_drv);
   1997
   1998MODULE_DESCRIPTION("Hyper-V Balloon");
   1999MODULE_LICENSE("GPL");