// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/hyperv.h>
#include <asm/mshyperv.h>
#include <linux/sched/isolation.h>

#include "hyperv_vmbus.h"

static void init_vp_index(struct vmbus_channel *channel);

const struct vmbus_device vmbus_devs[] = {
	/* IDE */
	{ .dev_type = HV_IDE,
	  HV_IDE_GUID,
	  .perf_device = true,
	  .allowed_in_isolated = false,
	},

	/* SCSI */
	{ .dev_type = HV_SCSI,
	  HV_SCSI_GUID,
	  .perf_device = true,
	  .allowed_in_isolated = true,
	},

	/* Fibre Channel */
	{ .dev_type = HV_FC,
	  HV_SYNTHFC_GUID,
	  .perf_device = true,
	  .allowed_in_isolated = false,
	},

	/* Synthetic NIC */
	{ .dev_type = HV_NIC,
	  HV_NIC_GUID,
	  .perf_device = true,
	  .allowed_in_isolated = true,
	},

	/* Network Direct */
	{ .dev_type = HV_ND,
	  HV_ND_GUID,
	  .perf_device = true,
	  .allowed_in_isolated = false,
	},

	/* PCIE */
	{ .dev_type = HV_PCIE,
	  HV_PCIE_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Synthetic Frame Buffer */
	{ .dev_type = HV_FB,
	  HV_SYNTHVID_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Synthetic Keyboard */
	{ .dev_type = HV_KBD,
	  HV_KBD_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Synthetic MOUSE */
	{ .dev_type = HV_MOUSE,
	  HV_MOUSE_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* KVP */
	{ .dev_type = HV_KVP,
	  HV_KVP_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Time Synch */
	{ .dev_type = HV_TS,
	  HV_TS_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = true,
	},

	/* Heartbeat */
	{ .dev_type = HV_HB,
	  HV_HEART_BEAT_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = true,
	},

	/* Shutdown */
	{ .dev_type = HV_SHUTDOWN,
	  HV_SHUTDOWN_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = true,
	},

	/* File copy */
	{ .dev_type = HV_FCOPY,
	  HV_FCOPY_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Backup */
	{ .dev_type = HV_BACKUP,
	  HV_VSS_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Dynamic Memory */
	{ .dev_type = HV_DM,
	  HV_DM_GUID,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},

	/* Unknown GUID */
	{ .dev_type = HV_UNKNOWN,
	  .perf_device = false,
	  .allowed_in_isolated = false,
	},
};
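
/*
 * Note (added): hv_get_dev_type() below scans vmbus_devs[] from HV_IDE
 * up to HV_UNKNOWN and returns the matching index, so the entries above
 * must stay in the same order as the HV_* device-type values that index
 * them.
 */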

static const struct {
	guid_t guid;
} vmbus_unsupported_devs[] = {
	{ HV_AVMA1_GUID },
	{ HV_AVMA2_GUID },
	{ HV_RDV_GUID },
	{ HV_IMC_GUID },
};

/*
 * The rescinded channel may be blocked waiting for a response from the host;
 * take care of that.
 */
static void vmbus_rescind_cleanup(struct vmbus_channel *channel)
{
	struct vmbus_channel_msginfo *msginfo;
	unsigned long flags;


	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
	channel->rescind = true;
	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
			    msglistentry) {

		if (msginfo->waiting_channel == channel) {
			complete(&msginfo->waitevent);
			break;
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

static bool is_unsupported_vmbus_devs(const guid_t *guid)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vmbus_unsupported_devs); i++)
		if (guid_equal(guid, &vmbus_unsupported_devs[i].guid))
			return true;
	return false;
}

static u16 hv_get_dev_type(const struct vmbus_channel *channel)
{
	const guid_t *guid = &channel->offermsg.offer.if_type;
	u16 i;

	if (is_hvsock_channel(channel) || is_unsupported_vmbus_devs(guid))
		return HV_UNKNOWN;

	for (i = HV_IDE; i < HV_UNKNOWN; i++) {
		if (guid_equal(guid, &vmbus_devs[i].guid))
			return i;
	}
	pr_info("Unknown GUID: %pUl\n", guid);
	return i;
}
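
/*
 * Added note: the index returned here is stored as channel->device_id
 * by vmbus_setup_channel_state() below.  Helpers such as
 * hv_is_perf_channel() (see hyperv_vmbus.h) then use device_id to look
 * up vmbus_devs[], e.g. to check .perf_device in init_vp_index().
 */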

/**
 * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
 * @icmsghdrp: Pointer to msg header structure
 * @buf: Raw buffer channel data
 * @buflen: Length of the raw buffer channel data.
 * @fw_version: The framework versions we can support.
 * @fw_vercnt: The size of @fw_version.
 * @srv_version: The service versions we can support.
 * @srv_vercnt: The size of @srv_version.
 * @nego_fw_version: The selected framework version.
 * @nego_srv_version: The selected service version.
 *
 * Note: Versions are given in decreasing order.
 *
 * Set up and fill in default negotiate response message.
 * Mainly used by Hyper-V drivers.
 */
bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
			       u32 buflen, const int *fw_version, int fw_vercnt,
			       const int *srv_version, int srv_vercnt,
			       int *nego_fw_version, int *nego_srv_version)
{
	int icframe_major, icframe_minor;
	int icmsg_major, icmsg_minor;
	int fw_major, fw_minor;
	int srv_major, srv_minor;
	int i, j;
	bool found_match = false;
	struct icmsg_negotiate *negop;

	/* Check that there's enough space for icframe_vercnt, icmsg_vercnt */
	if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) {
		pr_err_ratelimited("Invalid icmsg negotiate\n");
		return false;
	}

	icmsghdrp->icmsgsize = 0x10;
	negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR];

	icframe_major = negop->icframe_vercnt;
	icframe_minor = 0;

	icmsg_major = negop->icmsg_vercnt;
	icmsg_minor = 0;

	/* Validate negop packet */
	if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
	    icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
	    ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) {
		pr_err_ratelimited("Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n",
				   icframe_major, icmsg_major);
		goto fw_error;
	}

	/*
	 * Select the framework version number we will
	 * support.
	 */

	for (i = 0; i < fw_vercnt; i++) {
		fw_major = (fw_version[i] >> 16);
		fw_minor = (fw_version[i] & 0xFFFF);

		for (j = 0; j < negop->icframe_vercnt; j++) {
			if ((negop->icversion_data[j].major == fw_major) &&
			    (negop->icversion_data[j].minor == fw_minor)) {
				icframe_major = negop->icversion_data[j].major;
				icframe_minor = negop->icversion_data[j].minor;
				found_match = true;
				break;
			}
		}

		if (found_match)
			break;
	}

	if (!found_match)
		goto fw_error;

	found_match = false;

	for (i = 0; i < srv_vercnt; i++) {
		srv_major = (srv_version[i] >> 16);
		srv_minor = (srv_version[i] & 0xFFFF);

		for (j = negop->icframe_vercnt;
		     (j < negop->icframe_vercnt + negop->icmsg_vercnt);
		     j++) {

			if ((negop->icversion_data[j].major == srv_major) &&
			    (negop->icversion_data[j].minor == srv_minor)) {

				icmsg_major = negop->icversion_data[j].major;
				icmsg_minor = negop->icversion_data[j].minor;
				found_match = true;
				break;
			}
		}

		if (found_match)
			break;
	}

	/*
	 * Respond with the framework and service
	 * version numbers we can support.
	 */

fw_error:
	if (!found_match) {
		negop->icframe_vercnt = 0;
		negop->icmsg_vercnt = 0;
	} else {
		negop->icframe_vercnt = 1;
		negop->icmsg_vercnt = 1;
	}

	if (nego_fw_version)
		*nego_fw_version = (icframe_major << 16) | icframe_minor;

	if (nego_srv_version)
		*nego_srv_version = (icmsg_major << 16) | icmsg_minor;

	negop->icversion_data[0].major = icframe_major;
	negop->icversion_data[0].minor = icframe_minor;
	negop->icversion_data[1].major = icmsg_major;
	negop->icversion_data[1].minor = icmsg_minor;
	return found_match;
}
EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
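
/*
 * Illustrative caller sketch (added; modeled loosely on the hv_utils
 * drivers and not part of the original file): an IC driver's channel
 * callback typically feeds a received ICMSGTYPE_NEGOTIATE packet
 * through vmbus_prep_negotiate_resp() and sends the same buffer back
 * to the host.  The version arrays are listed in decreasing order of
 * preference, each entry encoded as (major << 16) | minor:
 *
 *	static const int fw_versions[] = { UTIL_FW_VERSION };
 *	static const int srv_versions[] = { SRV_VERSION_2, SRV_VERSION_1 };
 *	int srv_version;
 *
 *	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE)
 *		vmbus_prep_negotiate_resp(icmsghdrp, recv_buffer, recvlen,
 *					  fw_versions, ARRAY_SIZE(fw_versions),
 *					  srv_versions, ARRAY_SIZE(srv_versions),
 *					  NULL, &srv_version);
 *
 * SRV_VERSION_* and recv_buffer above are hypothetical names used only
 * for illustration.
 */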

/*
 * alloc_channel - Allocate and initialize a vmbus channel object
 */
static struct vmbus_channel *alloc_channel(void)
{
	struct vmbus_channel *channel;

	channel = kzalloc(sizeof(*channel), GFP_ATOMIC);
	if (!channel)
		return NULL;

	spin_lock_init(&channel->sched_lock);
	init_completion(&channel->rescind_event);

	INIT_LIST_HEAD(&channel->sc_list);

	tasklet_init(&channel->callback_event,
		     vmbus_on_event, (unsigned long)channel);

	hv_ringbuffer_pre_init(channel);

	return channel;
}

/*
 * free_channel - Release the resources used by the vmbus channel object
 */
static void free_channel(struct vmbus_channel *channel)
{
	tasklet_kill(&channel->callback_event);
	vmbus_remove_channel_attr_group(channel);

	kobject_put(&channel->kobj);
}

void vmbus_channel_map_relid(struct vmbus_channel *channel)
{
	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
		return;
	/*
	 * The mapping of the channel's relid is visible from the CPUs that
	 * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
	 * execute:
	 *
	 * (a) In the "normal (i.e., not resuming from hibernation)" path,
	 *     the full barrier in virt_store_mb() guarantees that the store
	 *     is propagated to all CPUs before the add_channel_work work
	 *     is queued.  In turn, add_channel_work is queued before the
	 *     channel's ring buffer is allocated/initialized and the
	 *     OPENCHANNEL message for the channel is sent in vmbus_open().
	 *     Hyper-V won't start sending the interrupts for the channel
	 *     before the OPENCHANNEL message is acked.  The memory barrier
	 *     in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
	 *     that vmbus_chan_sched() must find the channel's relid in
	 *     recv_int_page before retrieving the channel pointer from the
	 *     array of channels.
	 *
	 * (b) In the "resuming from hibernation" path, the virt_store_mb()
	 *     guarantees that the store is propagated to all CPUs before
	 *     the VMBus connection is marked as ready for the resume event
	 *     (cf. check_ready_for_resume_event()).  The interrupt handler
	 *     of the VMBus driver and vmbus_chan_sched() can not run before
	 *     vmbus_bus_resume() has completed execution (cf. resume_noirq).
	 */
	virt_store_mb(
		vmbus_connection.channels[channel->offermsg.child_relid],
		channel);
}

void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
{
	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
		return;
	WRITE_ONCE(
		vmbus_connection.channels[channel->offermsg.child_relid],
		NULL);
}
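
/*
 * For reference (added sketch, not part of the original file): the
 * consumer side of this mapping is relid2channel() in connection.c,
 * which the interrupt path uses to resolve a relid, roughly:
 *
 *	if (WARN_ON(channel_relid >= MAX_CHANNEL_RELIDS))
 *		return NULL;
 *	return READ_ONCE(vmbus_connection.channels[channel_relid]);
 *
 * The READ_ONCE() pairs with the virt_store_mb()/WRITE_ONCE() used by
 * the map/unmap helpers above.
 */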

static void vmbus_release_relid(u32 relid)
{
	struct vmbus_channel_relid_released msg;
	int ret;

	memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
	msg.child_relid = relid;
	msg.header.msgtype = CHANNELMSG_RELID_RELEASED;
	ret = vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released),
			     true);

	trace_vmbus_release_relid(&msg, ret);
}

void hv_process_channel_removal(struct vmbus_channel *channel)
{
	lockdep_assert_held(&vmbus_connection.channel_mutex);
	BUG_ON(!channel->rescind);

	/*
	 * hv_process_channel_removal() could find INVALID_RELID only for
	 * hv_sock channels.  See the inline comments in vmbus_onoffer().
	 */
	WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
		!is_hvsock_channel(channel));

	/*
	 * Upon suspend, an in-use hv_sock channel is removed from the array of
	 * channels and the relid is invalidated.  After hibernation, when the
	 * user-space application destroys the channel, it's unnecessary and
	 * unsafe to remove the channel from the array of channels.  See also
	 * the inline comments before the call of vmbus_release_relid() below.
	 */
	if (channel->offermsg.child_relid != INVALID_RELID)
		vmbus_channel_unmap_relid(channel);

	if (channel->primary_channel == NULL)
		list_del(&channel->listentry);
	else
		list_del(&channel->sc_list);

	/*
	 * If this is a "perf" channel, update the hv_numa_map[] masks so that
	 * init_vp_index() can (re-)use the CPU.
	 */
	if (hv_is_perf_channel(channel))
		hv_clear_allocated_cpu(channel->target_cpu);

	/*
	 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
	 * the relid is invalidated; after hibernation, when the user-space app
	 * destroys the channel, the relid is INVALID_RELID, and in this case
	 * it's unnecessary and unsafe to release the old relid, since the same
	 * relid can refer to a completely different channel now.
	 */
	if (channel->offermsg.child_relid != INVALID_RELID)
		vmbus_release_relid(channel->offermsg.child_relid);

	free_channel(channel);
}

void vmbus_free_channels(void)
{
	struct vmbus_channel *channel, *tmp;

	list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
				 listentry) {
		/* hv_process_channel_removal() needs this */
		channel->rescind = true;

		vmbus_device_unregister(channel->device_obj);
	}
}

/* Note: the function can run concurrently for primary/sub channels. */
static void vmbus_add_channel_work(struct work_struct *work)
{
	struct vmbus_channel *newchannel =
		container_of(work, struct vmbus_channel, add_channel_work);
	struct vmbus_channel *primary_channel = newchannel->primary_channel;
	int ret;

	/*
	 * This state is used to indicate a successful open
	 * so that when we do close the channel normally, we
	 * can clean up properly.
	 */
	newchannel->state = CHANNEL_OPEN_STATE;

	if (primary_channel != NULL) {
		/* newchannel is a sub-channel. */
		struct hv_device *dev = primary_channel->device_obj;

		if (vmbus_add_channel_kobj(dev, newchannel))
			goto err_deq_chan;

		if (primary_channel->sc_creation_callback != NULL)
			primary_channel->sc_creation_callback(newchannel);

		newchannel->probe_done = true;
		return;
	}

	/*
	 * Start the process of binding the primary channel to the driver
	 */
	newchannel->device_obj = vmbus_device_create(
		&newchannel->offermsg.offer.if_type,
		&newchannel->offermsg.offer.if_instance,
		newchannel);
	if (!newchannel->device_obj)
		goto err_deq_chan;

	newchannel->device_obj->device_id = newchannel->device_id;
	/*
	 * Add the new device to the bus.  This will kick off device-driver
	 * binding which eventually invokes the device driver's AddDevice()
	 * method.
	 */
	ret = vmbus_device_register(newchannel->device_obj);

	if (ret != 0) {
		pr_err("unable to add child device object (relid %d)\n",
		       newchannel->offermsg.child_relid);
		kfree(newchannel->device_obj);
		goto err_deq_chan;
	}

	newchannel->probe_done = true;
	return;

err_deq_chan:
	mutex_lock(&vmbus_connection.channel_mutex);

	/*
	 * We need to set the flag, otherwise
	 * vmbus_onoffer_rescind() can be blocked.
	 */
	newchannel->probe_done = true;

	if (primary_channel == NULL)
		list_del(&newchannel->listentry);
	else
		list_del(&newchannel->sc_list);

	/* vmbus_process_offer() has mapped the channel. */
	vmbus_channel_unmap_relid(newchannel);

	mutex_unlock(&vmbus_connection.channel_mutex);

	vmbus_release_relid(newchannel->offermsg.child_relid);

	free_channel(newchannel);
}

/*
 * vmbus_process_offer - Process the offer by creating a channel/device
 * associated with this offer
 */
static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
	struct vmbus_channel *channel;
	struct workqueue_struct *wq;
	bool fnew = true;

	/*
	 * Synchronize vmbus_process_offer() and CPU hotplugging:
	 *
	 * CPU1				CPU2
	 *
	 * [vmbus_process_offer()]	[Hot removal of the CPU]
	 *
	 * CPUS_READ_LOCK		CPUS_WRITE_LOCK
	 * LOAD cpu_online_mask		SEARCH chn_list
	 * STORE target_cpu		LOAD target_cpu
	 * INSERT chn_list		STORE cpu_online_mask
	 * CPUS_READ_UNLOCK		CPUS_WRITE_UNLOCK
	 *
	 * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
	 *		CPU2's SEARCH from *not* seeing CPU1's INSERT
	 *
	 * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
	 *		CPU2's LOAD from *not* seeing CPU1's STORE
	 */
	cpus_read_lock();

	/*
	 * Serializes the modifications of the chn_list list as well as
	 * the accesses to next_numa_node_id in init_vp_index().
	 */
	mutex_lock(&vmbus_connection.channel_mutex);

	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (guid_equal(&channel->offermsg.offer.if_type,
			       &newchannel->offermsg.offer.if_type) &&
		    guid_equal(&channel->offermsg.offer.if_instance,
			       &newchannel->offermsg.offer.if_instance)) {
			fnew = false;
			newchannel->primary_channel = channel;
			break;
		}
	}

	init_vp_index(newchannel);

	/* Remember the channels that should be cleaned up upon suspend. */
	if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
		atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);

	/*
	 * Now that we have acquired the channel_mutex,
	 * we can release the potentially racing rescind thread.
	 */
	atomic_dec(&vmbus_connection.offer_in_progress);

	if (fnew) {
		list_add_tail(&newchannel->listentry,
			      &vmbus_connection.chn_list);
	} else {
		/*
		 * Check to see if this is a valid sub-channel.
		 */
		if (newchannel->offermsg.offer.sub_channel_index == 0) {
			mutex_unlock(&vmbus_connection.channel_mutex);
			cpus_read_unlock();
			/*
			 * Don't call free_channel(), because newchannel->kobj
			 * is not initialized yet.
			 */
			kfree(newchannel);
			WARN_ON_ONCE(1);
			return;
		}
		/*
		 * Process the sub-channel.
		 */
		list_add_tail(&newchannel->sc_list, &channel->sc_list);
	}

	vmbus_channel_map_relid(newchannel);

	mutex_unlock(&vmbus_connection.channel_mutex);
	cpus_read_unlock();

	/*
	 * vmbus_process_offer() mustn't call channel->sc_creation_callback()
	 * directly for sub-channels, because sc_creation_callback() ->
	 * vmbus_open() may never get the host's response to the
	 * OPEN_CHANNEL message (the host may rescind a channel at any time,
	 * e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind()
	 * may not wake up the vmbus_open() as it's blocked due to a non-zero
	 * vmbus_connection.offer_in_progress, and finally we have a deadlock.
	 *
	 * The above is also true for primary channels, if the related device
	 * drivers use sync probing mode by default.
	 *
	 * And, usually the handling of primary channels and sub-channels can
	 * depend on each other, so we should offload them to different
	 * workqueues to avoid possible deadlock, e.g. in sync-probing mode,
	 * NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
	 * rtnl_lock(), causing a deadlock: the former gets the rtnl_lock
	 * and waits for all the sub-channels to appear, but the latter
	 * can't get the rtnl_lock and this blocks the handling of
	 * sub-channels.
	 */
	INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work);
	wq = fnew ? vmbus_connection.handle_primary_chan_wq :
		    vmbus_connection.handle_sub_chan_wq;
	queue_work(wq, &newchannel->add_channel_work);
}

/*
 * Check if the given CPU is used by other channels of the same device.
 * It should only be called by init_vp_index().
 */
static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
{
	struct vmbus_channel *primary = chn->primary_channel;
	struct vmbus_channel *sc;

	lockdep_assert_held(&vmbus_connection.channel_mutex);

	if (!primary)
		return false;

	if (primary->target_cpu == cpu)
		return true;

	list_for_each_entry(sc, &primary->sc_list, sc_list)
		if (sc != chn && sc->target_cpu == cpu)
			return true;

	return false;
}

/*
 * We use this state to statically distribute the channel interrupt load.
 */
static int next_numa_node_id;

/*
 * We can statically distribute the incoming channel interrupt load
 * by binding a channel to a VCPU.
 *
 * For non-performance critical channels we assign the VMBUS_CONNECT_CPU.
 * Performance critical channels will be distributed evenly among all
 * the available NUMA nodes.  Once the node is assigned, we will assign
 * the CPU based on a simple round robin scheme.
 */
static void init_vp_index(struct vmbus_channel *channel)
{
	bool perf_chn = hv_is_perf_channel(channel);
	u32 i, ncpu = num_online_cpus();
	cpumask_var_t available_mask;
	struct cpumask *allocated_mask;
	const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
	u32 target_cpu;
	int numa_node;

	if (!perf_chn ||
	    !alloc_cpumask_var(&available_mask, GFP_KERNEL) ||
	    cpumask_empty(hk_mask)) {
		/*
		 * If the channel is not a performance critical
		 * channel, bind it to VMBUS_CONNECT_CPU.
		 * In case alloc_cpumask_var() fails, bind it to
		 * VMBUS_CONNECT_CPU.
		 * If all the cpus are isolated, bind it to
		 * VMBUS_CONNECT_CPU.
		 */
		channel->target_cpu = VMBUS_CONNECT_CPU;
		if (perf_chn)
			hv_set_allocated_cpu(VMBUS_CONNECT_CPU);
		return;
	}

	for (i = 1; i <= ncpu + 1; i++) {
		while (true) {
			numa_node = next_numa_node_id++;
			if (numa_node == nr_node_ids) {
				next_numa_node_id = 0;
				continue;
			}
			if (cpumask_empty(cpumask_of_node(numa_node)))
				continue;
			break;
		}
		allocated_mask = &hv_context.hv_numa_map[numa_node];

retry:
		cpumask_xor(available_mask, allocated_mask, cpumask_of_node(numa_node));
		cpumask_and(available_mask, available_mask, hk_mask);

		if (cpumask_empty(available_mask)) {
			/*
			 * We have cycled through all the CPUs in the node;
			 * reset the allocated map.
			 */
			cpumask_clear(allocated_mask);
			goto retry;
		}

		target_cpu = cpumask_first(available_mask);
		cpumask_set_cpu(target_cpu, allocated_mask);

		if (channel->offermsg.offer.sub_channel_index >= ncpu ||
		    i > ncpu || !hv_cpuself_used(target_cpu, channel))
			break;
	}

	channel->target_cpu = target_cpu;

	free_cpumask_var(available_mask);
}
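
/*
 * Worked example (added; hypothetical topology): with two NUMA nodes
 * {cpu0-3} and {cpu4-7}, all CPUs online and none isolated, four
 * consecutive perf channels land on cpu0 (node 0), cpu4 (node 1),
 * cpu1 (node 0) and cpu5 (node 1): the node advances round-robin via
 * next_numa_node_id, and within a node the first CPU not yet set in
 * hv_numa_map[] is picked.
 */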

#define UNLOAD_DELAY_UNIT_MS	10		/* 10 milliseconds */
#define UNLOAD_WAIT_MS		(100*1000)	/* 100 seconds */
#define UNLOAD_WAIT_LOOPS	(UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
#define UNLOAD_MSG_MS		(5*1000)	/* Every 5 seconds */
#define UNLOAD_MSG_LOOPS	(UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)
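
/*
 * Added note on the arithmetic: UNLOAD_WAIT_LOOPS = 100000 / 10 =
 * 10000 iterations of mdelay(10), i.e. ~100 seconds total, and
 * UNLOAD_MSG_LOOPS = 5000 / 10 = 500, so a progress message is printed
 * roughly every 5 seconds while waiting.
 */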

static void vmbus_wait_for_unload(void)
{
	int cpu;
	void *page_addr;
	struct hv_message *msg;
	struct vmbus_channel_message_header *hdr;
	u32 message_type, i;

	/*
	 * CHANNELMSG_UNLOAD_RESPONSE is always delivered to the CPU which was
	 * used for initial contact or to CPU0 depending on host version.  When
	 * we're crashing on a different CPU, let's hope that the IRQ handler
	 * on the CPU which receives CHANNELMSG_UNLOAD_RESPONSE is still
	 * functional and vmbus_unload_response() will complete
	 * vmbus_connection.unload_event.  If not, the last thing we can do is
	 * read message pages for all CPUs directly.
	 *
	 * Wait up to 100 seconds since an Azure host must write back any dirty
	 * data in its disk cache before the VMbus UNLOAD request will
	 * complete.  This flushing has been empirically observed to take up
	 * to 50 seconds in cases with a lot of dirty data, so allow additional
	 * leeway and for inaccuracies in mdelay().  But eventually time out so
	 * that the panic path can't get hung forever in case the response
	 * message isn't seen.
	 */
	for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
		if (completion_done(&vmbus_connection.unload_event))
			goto completed;

		for_each_online_cpu(cpu) {
			struct hv_per_cpu_context *hv_cpu
				= per_cpu_ptr(hv_context.cpu_context, cpu);

			page_addr = hv_cpu->synic_message_page;
			msg = (struct hv_message *)page_addr
				+ VMBUS_MESSAGE_SINT;

			message_type = READ_ONCE(msg->header.message_type);
			if (message_type == HVMSG_NONE)
				continue;

			hdr = (struct vmbus_channel_message_header *)
				msg->u.payload;

			if (hdr->msgtype == CHANNELMSG_UNLOAD_RESPONSE)
				complete(&vmbus_connection.unload_event);

			vmbus_signal_eom(msg, message_type);
		}

		/*
		 * Give a notice periodically so someone watching the
		 * serial output won't think it is completely hung.
		 */
		if (!(i % UNLOAD_MSG_LOOPS))
			pr_notice("Waiting for VMBus UNLOAD to complete\n");

		mdelay(UNLOAD_DELAY_UNIT_MS);
	}
	pr_err("Continuing even though VMBus UNLOAD did not complete\n");

completed:
	/*
	 * We're crashing and already got the UNLOAD_RESPONSE; clean up all
	 * maybe-pending messages on all CPUs to be able to receive new
	 * messages after we reconnect.
	 */
	for_each_online_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu
			= per_cpu_ptr(hv_context.cpu_context, cpu);

		page_addr = hv_cpu->synic_message_page;
		msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
		msg->header.message_type = HVMSG_NONE;
	}
}

/*
 * vmbus_unload_response - Handler for the unload response.
 */
static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
{
	/*
	 * This is a global event; just wake up the waiting thread.
	 * Once we successfully unload, we can clean up the monitor state.
	 *
	 * NB. A malicious or compromised Hyper-V could send a spurious
	 * message of type CHANNELMSG_UNLOAD_RESPONSE, and trigger a call
	 * of the complete() below.  Make sure that unload_event has been
	 * initialized by the time this complete() is executed.
	 */
	complete(&vmbus_connection.unload_event);
}

void vmbus_initiate_unload(bool crash)
{
	struct vmbus_channel_message_header hdr;

	if (xchg(&vmbus_connection.conn_state, DISCONNECTED) == DISCONNECTED)
		return;

	/* Pre-Win2012R2 hosts don't support reconnect */
	if (vmbus_proto_version < VERSION_WIN8_1)
		return;

	reinit_completion(&vmbus_connection.unload_event);
	memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
	hdr.msgtype = CHANNELMSG_UNLOAD;
	vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header),
		       !crash);

	/*
	 * vmbus_initiate_unload() is also called on crash, and the crash can
	 * happen in interrupt context, where scheduling is impossible.
	 */
	if (!crash)
		wait_for_completion(&vmbus_connection.unload_event);
	else
		vmbus_wait_for_unload();
}

static void check_ready_for_resume_event(void)
{
	/*
	 * If all the old primary channels have been fixed up, then it's safe
	 * to resume.
	 */
	if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
		complete(&vmbus_connection.ready_for_resume_event);
}

static void vmbus_setup_channel_state(struct vmbus_channel *channel,
				      struct vmbus_channel_offer_channel *offer)
{
	/*
	 * Setup state for signalling the host.
	 */
	channel->sig_event = VMBUS_EVENT_CONNECTION_ID;

	channel->is_dedicated_interrupt =
		(offer->is_dedicated_interrupt != 0);
	channel->sig_event = offer->connection_id;

	memcpy(&channel->offermsg, offer,
	       sizeof(struct vmbus_channel_offer_channel));
	channel->monitor_grp = (u8)offer->monitorid / 32;
	channel->monitor_bit = (u8)offer->monitorid % 32;
	channel->device_id = hv_get_dev_type(channel);
}
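
/*
 * Added example: monitorid is split into a 32-bit-wide group and a bit
 * position within that group, so e.g. monitorid 69 yields
 * monitor_grp = 69 / 32 = 2 and monitor_bit = 69 % 32 = 5.
 */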

/*
 * find_primary_channel_by_offer - Get the channel object given the new offer.
 * This is only used in the resume path of hibernation.
 */
static struct vmbus_channel *
find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
{
	struct vmbus_channel *channel = NULL, *iter;
	const guid_t *inst1, *inst2;

	/* Ignore sub-channel offers. */
	if (offer->offer.sub_channel_index != 0)
		return NULL;

	mutex_lock(&vmbus_connection.channel_mutex);

	list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
		inst1 = &iter->offermsg.offer.if_instance;
		inst2 = &offer->offer.if_instance;

		if (guid_equal(inst1, inst2)) {
			channel = iter;
			break;
		}
	}

	mutex_unlock(&vmbus_connection.channel_mutex);

	return channel;
}

static bool vmbus_is_valid_offer(const struct vmbus_channel_offer_channel *offer)
{
	const guid_t *guid = &offer->offer.if_type;
	u16 i;

	if (!hv_is_isolation_supported())
		return true;

	if (is_hvsock_offer(offer))
		return true;

	for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) {
		if (guid_equal(guid, &vmbus_devs[i].guid))
			return vmbus_devs[i].allowed_in_isolated;
	}
	return false;
}

/*
 * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
 */
static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_offer_channel *offer;
	struct vmbus_channel *oldchannel, *newchannel;
	size_t offer_sz;

	offer = (struct vmbus_channel_offer_channel *)hdr;

	trace_vmbus_onoffer(offer);

	if (!vmbus_is_valid_offer(offer)) {
		pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n",
				   offer->child_relid);
		atomic_dec(&vmbus_connection.offer_in_progress);
		return;
	}

	oldchannel = find_primary_channel_by_offer(offer);

	if (oldchannel != NULL) {
		/*
		 * We're resuming from hibernation: all the sub-channel and
		 * hv_sock channels we had before the hibernation should have
		 * been cleaned up, and now we must be seeing a re-offered
		 * primary channel that we had before the hibernation.
		 */

		/*
		 * { Initially: channel relid = INVALID_RELID,
		 *		channels[valid_relid] = NULL }
		 *
		 * CPU1					CPU2
		 *
		 * [vmbus_onoffer()]			[vmbus_device_release()]
		 *
		 * LOCK channel_mutex			LOCK channel_mutex
		 * STORE channel relid = valid_relid	LOAD r1 = channel relid
		 * MAP_RELID channel			if (r1 != INVALID_RELID)
		 * UNLOCK channel_mutex			  UNMAP_RELID channel
		 *					UNLOCK channel_mutex
		 *
		 * Forbids: r1 == valid_relid &&
		 *		channels[valid_relid] == channel
		 *
		 * Note.  r1 can be INVALID_RELID only for an hv_sock channel.
		 * None of the hv_sock channels which were present before the
		 * suspend are re-offered upon the resume.  See the WARN_ON()
		 * in hv_process_channel_removal().
		 */
		mutex_lock(&vmbus_connection.channel_mutex);

		atomic_dec(&vmbus_connection.offer_in_progress);

		WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
		/* Fix up the relid. */
		oldchannel->offermsg.child_relid = offer->child_relid;

		offer_sz = sizeof(*offer);
		if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
			/*
			 * This is not an error, since the host can also change
			 * the other field(s) of the offer, e.g. on WS RS5
			 * (Build 17763), the offer->connection_id of the
			 * Mellanox VF vmbus device can change when the host
			 * reoffers the device upon resume.
			 */
			pr_debug("vmbus offer changed: relid=%d\n",
				 offer->child_relid);

			print_hex_dump_debug("Old vmbus offer: ",
					     DUMP_PREFIX_OFFSET, 16, 4,
					     &oldchannel->offermsg, offer_sz,
					     false);
			print_hex_dump_debug("New vmbus offer: ",
					     DUMP_PREFIX_OFFSET, 16, 4,
					     offer, offer_sz, false);

			/* Fix up the old channel. */
			vmbus_setup_channel_state(oldchannel, offer);
		}

		/* Add the channel back to the array of channels. */
		vmbus_channel_map_relid(oldchannel);
		check_ready_for_resume_event();

		mutex_unlock(&vmbus_connection.channel_mutex);
		return;
	}

	/* Allocate the channel object and save this offer. */
	newchannel = alloc_channel();
	if (!newchannel) {
		vmbus_release_relid(offer->child_relid);
		atomic_dec(&vmbus_connection.offer_in_progress);
		pr_err("Unable to allocate channel object\n");
		return;
	}

	vmbus_setup_channel_state(newchannel, offer);

	vmbus_process_offer(newchannel);
}

static void check_ready_for_suspend_event(void)
{
	/*
	 * If all the sub-channels or hv_sock channels have been cleaned up,
	 * then it's safe to suspend.
	 */
	if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
		complete(&vmbus_connection.ready_for_suspend_event);
}

/*
 * vmbus_onoffer_rescind - Rescind offer handler.
 *
 * We queue a work item to process this offer synchronously
 */
static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_rescind_offer *rescind;
	struct vmbus_channel *channel;
	struct device *dev;
	bool clean_up_chan_for_suspend;

	rescind = (struct vmbus_channel_rescind_offer *)hdr;

	trace_vmbus_onoffer_rescind(rescind);

	/*
	 * The offer msg and the corresponding rescind msg
	 * from the host are guaranteed to be ordered -
	 * offer comes in first and then the rescind.
	 * Since we process these events in work elements,
	 * and with preemption, we may end up processing
	 * the events out of order.  We rely on the synchronization
	 * provided by offer_in_progress and by channel_mutex for
	 * ordering these events:
	 *
	 * { Initially: offer_in_progress = 1 }
	 *
	 * CPU1				CPU2
	 *
	 * [vmbus_onoffer()]		[vmbus_onoffer_rescind()]
	 *
	 * LOCK channel_mutex		WAIT_ON offer_in_progress == 0
	 * DECREMENT offer_in_progress	LOCK channel_mutex
	 * STORE channels[]		LOAD channels[]
	 * UNLOCK channel_mutex		UNLOCK channel_mutex
	 *
	 * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
	 */

	while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
		/*
		 * Wait here while any channel offer is still being
		 * processed.
		 */
		msleep(1);
	}

	mutex_lock(&vmbus_connection.channel_mutex);
	channel = relid2channel(rescind->child_relid);
	if (channel != NULL) {
		/*
		 * Guarantee that no other instance of vmbus_onoffer_rescind()
		 * has got a reference to the channel object.  Synchronize on
		 * &vmbus_connection.channel_mutex.
		 */
		if (channel->rescind_ref) {
			mutex_unlock(&vmbus_connection.channel_mutex);
			return;
		}
		channel->rescind_ref = true;
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	if (channel == NULL) {
		/*
		 * We failed in processing the offer message;
		 * we would have cleaned up the relid in that
		 * failure path.
		 */
		return;
	}

	clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
				    is_sub_channel(channel);
	/*
	 * Before setting channel->rescind in vmbus_rescind_cleanup(), we
	 * should make sure the channel callback is not running any more.
	 */
	vmbus_reset_channel_cb(channel);

	/*
	 * Now wait for offer handling to complete.
	 */
	vmbus_rescind_cleanup(channel);
	while (READ_ONCE(channel->probe_done) == false) {
		/*
		 * Wait here until the offer handling has completed and
		 * probe_done has been set.
		 */
		msleep(1);
	}

	/*
	 * At this point, the rescind handling can proceed safely.
	 */

	if (channel->device_obj) {
		if (channel->chn_rescind_callback) {
			channel->chn_rescind_callback(channel);

			if (clean_up_chan_for_suspend)
				check_ready_for_suspend_event();

			return;
		}
		/*
		 * We will have to unregister this device from the
		 * driver core.
		 */
		dev = get_device(&channel->device_obj->device);
		if (dev) {
			vmbus_device_unregister(channel->device_obj);
			put_device(dev);
		}
	} else if (channel->primary_channel != NULL) {
		/*
		 * Sub-channel is being rescinded.  Following is the channel
		 * close sequence when initiated from the driver (refer to
		 * vmbus_close() for details):
		 * 1. Close all sub-channels first
		 * 2. Then close the primary channel.
		 */
		mutex_lock(&vmbus_connection.channel_mutex);
		if (channel->state == CHANNEL_OPEN_STATE) {
			/*
			 * The channel is currently not open;
			 * it is safe for us to clean up the channel.
			 */
			hv_process_channel_removal(channel);
		} else {
			complete(&channel->rescind_event);
		}
		mutex_unlock(&vmbus_connection.channel_mutex);
	}

	/* The "channel" may have been freed.  Do not access it any longer. */

	if (clean_up_chan_for_suspend)
		check_ready_for_suspend_event();
}

void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
{
	BUG_ON(!is_hvsock_channel(channel));

	/* We always get a rescind msg when a connection is closed. */
	while (!READ_ONCE(channel->probe_done) || !READ_ONCE(channel->rescind))
		msleep(1);

	vmbus_device_unregister(channel->device_obj);
}
EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister);

/*
 * vmbus_onoffers_delivered -
 * This is invoked when all offers have been delivered.
 *
 * Nothing to do here.
 */
static void vmbus_onoffers_delivered(
			struct vmbus_channel_message_header *hdr)
{
}

/*
 * vmbus_onopen_result - Open result handler.
 *
 * This is invoked when we received a response to our channel open request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_onopen_result(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_open_result *result;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_open_channel *openmsg;
	unsigned long flags;

	result = (struct vmbus_channel_open_result *)hdr;

	trace_vmbus_onopen_result(result);

	/*
	 * Find the open msg, copy the result and signal/unblock the wait event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
			    msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_OPENCHANNEL) {
			openmsg =
				(struct vmbus_channel_open_channel *)msginfo->msg;
			if (openmsg->child_relid == result->child_relid &&
			    openmsg->openid == result->openid) {
				memcpy(&msginfo->response.open_result,
				       result,
				       sizeof(
					struct vmbus_channel_open_result));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_ongpadl_created - GPADL created handler.
 *
 * This is invoked when we received a response to our gpadl create request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_gpadl_created *gpadlcreated;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_gpadl_header *gpadlheader;
	unsigned long flags;

	gpadlcreated = (struct vmbus_channel_gpadl_created *)hdr;

	trace_vmbus_ongpadl_created(gpadlcreated);

	/*
	 * Find the establish msg, copy the result and signal/unblock the wait
	 * event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
			    msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_GPADL_HEADER) {
			gpadlheader =
				(struct vmbus_channel_gpadl_header *)requestheader;

			if ((gpadlcreated->child_relid ==
			     gpadlheader->child_relid) &&
			    (gpadlcreated->gpadl == gpadlheader->gpadl)) {
				memcpy(&msginfo->response.gpadl_created,
				       gpadlcreated,
				       sizeof(
					struct vmbus_channel_gpadl_created));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_onmodifychannel_response - Modify Channel response handler.
 *
 * This is invoked when we received a response to our channel modify request.
 * Find the matching request, copy the response and signal the requesting thread.
 */
static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_modifychannel_response *response;
	struct vmbus_channel_msginfo *msginfo;
	unsigned long flags;

	response = (struct vmbus_channel_modifychannel_response *)hdr;

	trace_vmbus_onmodifychannel_response(response);

	/*
	 * Find the modify msg, copy the response and signal/unblock the wait event.
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
		struct vmbus_channel_message_header *responseheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
			struct vmbus_channel_modifychannel *modifymsg;

			modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
			if (modifymsg->child_relid == response->child_relid) {
				memcpy(&msginfo->response.modify_response, response,
				       sizeof(*response));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_ongpadl_torndown - GPADL torndown handler.
 *
 * This is invoked when we received a response to our gpadl teardown request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_ongpadl_torndown(
			struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_gpadl_torndown *gpadl_torndown;
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_gpadl_teardown *gpadl_teardown;
	unsigned long flags;

	gpadl_torndown = (struct vmbus_channel_gpadl_torndown *)hdr;

	trace_vmbus_ongpadl_torndown(gpadl_torndown);

	/*
	 * Find the teardown msg, copy the result and signal/unblock the wait
	 * event
	 */
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
			    msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype == CHANNELMSG_GPADL_TEARDOWN) {
			gpadl_teardown =
				(struct vmbus_channel_gpadl_teardown *)requestheader;

			if (gpadl_torndown->gpadl == gpadl_teardown->gpadl) {
				memcpy(&msginfo->response.gpadl_torndown,
				       gpadl_torndown,
				       sizeof(
					struct vmbus_channel_gpadl_torndown));
				complete(&msginfo->waitevent);
				break;
			}
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

/*
 * vmbus_onversion_response - Version response handler
 *
 * This is invoked when we received a response to our initiate contact request.
 * Find the matching request, copy the response and signal the requesting
 * thread.
 */
static void vmbus_onversion_response(
			struct vmbus_channel_message_header *hdr)
{
	struct vmbus_channel_msginfo *msginfo;
	struct vmbus_channel_message_header *requestheader;
	struct vmbus_channel_version_response *version_response;
	unsigned long flags;

	version_response = (struct vmbus_channel_version_response *)hdr;

	trace_vmbus_onversion_response(version_response);

	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);

	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
			    msglistentry) {
		requestheader =
			(struct vmbus_channel_message_header *)msginfo->msg;

		if (requestheader->msgtype ==
		    CHANNELMSG_INITIATE_CONTACT) {
			memcpy(&msginfo->response.version_response,
			       version_response,
			       sizeof(struct vmbus_channel_version_response));
			complete(&msginfo->waitevent);
		}
	}
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}
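
/*
 * Added note on the table below (columns per struct
 * vmbus_channel_message_table_entry in hyperv_vmbus.h): each entry is
 * { message type, handler type, handler, minimum payload length }.
 * To the best of our understanding, the handler-type flag separates
 * handlers that may sleep (0, dispatched to a work queue) from those
 * that must not sleep (1, may be invoked directly from
 * vmbus_on_msg_dpc()); see VMHT_BLOCKING / VMHT_NON_BLOCKING in
 * hyperv_vmbus.h.
 */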

/* Channel message dispatch table */
const struct vmbus_channel_message_table_entry
channel_message_table[CHANNELMSG_COUNT] = {
	{ CHANNELMSG_INVALID,			0, NULL, 0},
	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer,
		sizeof(struct vmbus_channel_offer_channel)},
	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind,
		sizeof(struct vmbus_channel_rescind_offer) },
	{ CHANNELMSG_REQUESTOFFERS,		0, NULL, 0},
	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered, 0},
	{ CHANNELMSG_OPENCHANNEL,		0, NULL, 0},
	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result,
		sizeof(struct vmbus_channel_open_result)},
	{ CHANNELMSG_CLOSECHANNEL,		0, NULL, 0},
	{ CHANNELMSG_GPADL_HEADER,		0, NULL, 0},
	{ CHANNELMSG_GPADL_BODY,		0, NULL, 0},
	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created,
		sizeof(struct vmbus_channel_gpadl_created)},
	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL, 0},
	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown,
		sizeof(struct vmbus_channel_gpadl_torndown) },
	{ CHANNELMSG_RELID_RELEASED,		0, NULL, 0},
	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL, 0},
	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response,
		sizeof(struct vmbus_channel_version_response)},
	{ CHANNELMSG_UNLOAD,			0, NULL, 0},
	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response, 0},
	{ CHANNELMSG_18,			0, NULL, 0},
	{ CHANNELMSG_19,			0, NULL, 0},
	{ CHANNELMSG_20,			0, NULL, 0},
	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
	{ CHANNELMSG_MODIFYCHANNEL_RESPONSE,	1, vmbus_onmodifychannel_response,
		sizeof(struct vmbus_channel_modifychannel_response)},
};

/*
 * vmbus_onmessage - Handler for channel protocol messages.
 *
 * This is invoked in the vmbus worker thread context.
 */
void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
{
	trace_vmbus_on_message(hdr);

	/*
	 * vmbus_on_msg_dpc() makes sure the hdr->msgtype here can not go
	 * out of bound and the message_handler pointer can not be NULL.
	 */
	channel_message_table[hdr->msgtype].message_handler(hdr);
}

/*
 * vmbus_request_offers - Send a request to get all our pending offers.
 */
int vmbus_request_offers(void)
{
	struct vmbus_channel_message_header *msg;
	struct vmbus_channel_msginfo *msginfo;
	int ret;

	msginfo = kzalloc(sizeof(*msginfo) +
			  sizeof(struct vmbus_channel_message_header),
			  GFP_KERNEL);
	if (!msginfo)
		return -ENOMEM;

	msg = (struct vmbus_channel_message_header *)msginfo->msg;

	msg->msgtype = CHANNELMSG_REQUESTOFFERS;

	ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_message_header),
			     true);

	trace_vmbus_request_offers(ret);

	if (ret != 0) {
		pr_err("Unable to request offers - %d\n", ret);

		goto cleanup;
	}

cleanup:
	kfree(msginfo);

	return ret;
}

void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel,
				  void (*sc_cr_cb)(struct vmbus_channel *new_sc))
{
	primary_channel->sc_creation_callback = sc_cr_cb;
}
EXPORT_SYMBOL_GPL(vmbus_set_sc_create_callback);

void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel,
				    void (*chn_rescind_cb)(struct vmbus_channel *))
{
	channel->chn_rescind_callback = chn_rescind_cb;
}
EXPORT_SYMBOL_GPL(vmbus_set_chn_rescind_callback);