cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pci-hyperv.c (112497B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) Microsoft Corporation.
      4 *
      5 * Author:
      6 *   Jake Oshins <jakeo@microsoft.com>
      7 *
      8 * This driver acts as a paravirtual front-end for PCI Express root buses.
      9 * When a PCI Express function (either an entire device or an SR-IOV
     10 * Virtual Function) is being passed through to the VM, this driver exposes
     11 * a new bus to the guest VM.  This is modeled as a root PCI bus because
     12 * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
     13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
     14 * until a device has been exposed using this driver.
     15 *
     16 * Each root PCI bus has its own PCI domain, which is called "Segment" in
     17 * the PCI Firmware Specifications.  Thus while each device passed through
     18 * to the VM using this front-end will appear at "device 0", the domain will
     19 * be unique.  Typically, each bus will have one PCI function on it, though
     20 * this driver does support more than one.
     21 *
     22 * In order to map the interrupts from the device through to the guest VM,
     23 * this driver also implements an IRQ Domain, which handles interrupts (either
     24 * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
     25 * set up, torn down, or reaffined, this driver communicates with the
     26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
     27 * interrupt will be delivered to the correct virtual processor at the right
     28 * vector.  This driver does not support level-triggered (line-based)
     29 * interrupts, and will report that the Interrupt Line register in the
     30 * function's configuration space is zero.
     31 *
     32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
     33 * facilities.  For instance, the configuration space of a function exposed
     34 * by Hyper-V is mapped into a single page of memory space, and the
     35 * read and write handlers for config space must be aware of this mechanism.
     36 * Similarly, device setup and teardown involves messages sent to and from
     37 * the PCI back-end driver in Hyper-V.
     38 */
     39
     40#include <linux/kernel.h>
     41#include <linux/module.h>
     42#include <linux/pci.h>
     43#include <linux/pci-ecam.h>
     44#include <linux/delay.h>
     45#include <linux/semaphore.h>
     46#include <linux/irq.h>
     47#include <linux/msi.h>
     48#include <linux/hyperv.h>
     49#include <linux/refcount.h>
     50#include <linux/irqdomain.h>
     51#include <linux/acpi.h>
     52#include <asm/mshyperv.h>
     53
     54/*
     55 * Protocol versions. The low word is the minor version, the high word the
     56 * major version.
     57 */
     58
     59#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
     60#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
     61#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
     62
     63enum pci_protocol_version_t {
     64	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),	/* Win10 */
     65	PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),	/* RS1 */
     66	PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3),	/* Vibranium */
     67	PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4),	/* WS2022 */
     68};
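/*
 * Editorial example (not part of the upstream file): the macros above pack
 * the major version into the high 16 bits of a u32, so the negotiated
 * versions decompose as checked below.  This sketch assumes static_assert()
 * from <linux/build_bug.h> (pulled in via the includes above) is usable at
 * this point in the file.
 */
static_assert(PCI_MAKE_VERSION(1, 4) == 0x00010004);
static_assert(PCI_MAJOR_VERSION(PCI_PROTOCOL_VERSION_1_4) == 1);
static_assert(PCI_MINOR_VERSION(PCI_PROTOCOL_VERSION_1_4) == 4);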
     69
     70#define CPU_AFFINITY_ALL	-1ULL
     71
     72/*
     73 * Supported protocol versions in the order of probing - highest goes
     74 * first.
     75 */
     76static enum pci_protocol_version_t pci_protocol_versions[] = {
     77	PCI_PROTOCOL_VERSION_1_4,
     78	PCI_PROTOCOL_VERSION_1_3,
     79	PCI_PROTOCOL_VERSION_1_2,
     80	PCI_PROTOCOL_VERSION_1_1,
     81};
     82
     83#define PCI_CONFIG_MMIO_LENGTH	0x2000
     84#define CFG_PAGE_OFFSET 0x1000
     85#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
     86
     87#define MAX_SUPPORTED_MSI_MESSAGES 0x400
     88
     89#define STATUS_REVISION_MISMATCH 0xC0000059
     90
     91/* space for 32-bit serial number as string */
     92#define SLOT_NAME_SIZE 11
     93
     94/*
     95 * Size of requestor for VMbus; the value is based on the observation
     96 * that having more than one request outstanding is 'rare', and so 64
     97 * should be generous in ensuring that we don't ever run out.
     98 */
     99#define HV_PCI_RQSTOR_SIZE 64
    100
    101/*
    102 * Message Types
    103 */
    104
    105enum pci_message_type {
    106	/*
    107	 * Version 1.1
    108	 */
    109	PCI_MESSAGE_BASE                = 0x42490000,
    110	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
    111	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
    112	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
    113	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
    114	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
    115	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
    116	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
    117	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
    118	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
    119	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
    120	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
    121	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
    122	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
    123	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
    124	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
    125	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
    126	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
    127	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
    128	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
    129	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
    130	PCI_RESOURCES_ASSIGNED2		= PCI_MESSAGE_BASE + 0x16,
    131	PCI_CREATE_INTERRUPT_MESSAGE2	= PCI_MESSAGE_BASE + 0x17,
    132	PCI_DELETE_INTERRUPT_MESSAGE2	= PCI_MESSAGE_BASE + 0x18, /* unused */
    133	PCI_BUS_RELATIONS2		= PCI_MESSAGE_BASE + 0x19,
    134	PCI_RESOURCES_ASSIGNED3         = PCI_MESSAGE_BASE + 0x1A,
    135	PCI_CREATE_INTERRUPT_MESSAGE3   = PCI_MESSAGE_BASE + 0x1B,
    136	PCI_MESSAGE_MAXIMUM
    137};
    138
    139/*
    140 * Structures defining the virtual PCI Express protocol.
    141 */
    142
    143union pci_version {
    144	struct {
    145		u16 minor_version;
    146		u16 major_version;
    147	} parts;
    148	u32 version;
    149} __packed;
    150
    151/*
    152 * Function numbers are 8 bits wide on Express, as interpreted through ARI,
    153 * which is all this driver does.  This representation is the one used in
    154 * Windows, which is what is expected when sending this back and forth with
    155 * the Hyper-V parent partition.
    156 */
    157union win_slot_encoding {
    158	struct {
    159		u32	dev:5;
    160		u32	func:3;
    161		u32	reserved:24;
    162	} bits;
    163	u32 slot;
    164} __packed;
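/*
 * Editorial example (not part of the upstream file): Linux devfn 0x0b,
 * i.e. PCI_DEVFN(1, 3), encodes as bits.dev = 1 and bits.func = 3, giving
 * slot = (3 << 5) | 1 = 0x61.  devfn_to_wslot() and wslot_to_devfn()
 * further down perform exactly this conversion in both directions.
 */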
    165
    166/*
    167 * Pretty much as defined in the PCI Specifications.
    168 */
    169struct pci_function_description {
    170	u16	v_id;	/* vendor ID */
    171	u16	d_id;	/* device ID */
    172	u8	rev;
    173	u8	prog_intf;
    174	u8	subclass;
    175	u8	base_class;
    176	u32	subsystem_id;
    177	union win_slot_encoding win_slot;
    178	u32	ser;	/* serial number */
    179} __packed;
    180
    181enum pci_device_description_flags {
    182	HV_PCI_DEVICE_FLAG_NONE			= 0x0,
    183	HV_PCI_DEVICE_FLAG_NUMA_AFFINITY	= 0x1,
    184};
    185
    186struct pci_function_description2 {
    187	u16	v_id;	/* vendor ID */
    188	u16	d_id;	/* device ID */
    189	u8	rev;
    190	u8	prog_intf;
    191	u8	subclass;
    192	u8	base_class;
    193	u32	subsystem_id;
    194	union	win_slot_encoding win_slot;
    195	u32	ser;	/* serial number */
    196	u32	flags;
    197	u16	virtual_numa_node;
    198	u16	reserved;
    199} __packed;
    200
    201/**
    202 * struct hv_msi_desc
    203 * @vector:		IDT entry
    204 * @delivery_mode:	As defined in Intel's Programmer's
    205 *			Reference Manual, Volume 3, Chapter 8.
    206 * @vector_count:	Number of contiguous entries in the
    207 *			Interrupt Descriptor Table that are
    208 *			occupied by this Message-Signaled
    209 *			Interrupt. For "MSI", as first defined
    210 *			in PCI 2.2, this can be between 1 and
    211 *			32. For "MSI-X," as first defined in PCI
    212 *			3.0, this must be 1, as each MSI-X table
    213 *			entry would have its own descriptor.
    214 * @reserved:		Empty space
    215 * @cpu_mask:		All the target virtual processors.
    216 */
    217struct hv_msi_desc {
    218	u8	vector;
    219	u8	delivery_mode;
    220	u16	vector_count;
    221	u32	reserved;
    222	u64	cpu_mask;
    223} __packed;
    224
    225/**
    226 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
    227 * @vector:		IDT entry
    228 * @delivery_mode:	As defined in Intel's Programmer's
    229 *			Reference Manual, Volume 3, Chapter 8.
    230 * @vector_count:	Number of contiguous entries in the
    231 *			Interrupt Descriptor Table that are
    232 *			occupied by this Message-Signaled
    233 *			Interrupt. For "MSI", as first defined
    234 *			in PCI 2.2, this can be between 1 and
    235 *			32. For "MSI-X," as first defined in PCI
    236 *			3.0, this must be 1, as each MSI-X table
    237 *			entry would have its own descriptor.
    238 * @processor_count:	number of bits enabled in array.
    239 * @processor_array:	All the target virtual processors.
    240 */
    241struct hv_msi_desc2 {
    242	u8	vector;
    243	u8	delivery_mode;
    244	u16	vector_count;
    245	u16	processor_count;
    246	u16	processor_array[32];
    247} __packed;
    248
    249/*
    250 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
    251 *	Everything is the same as in 'hv_msi_desc2' except that the size of the
    252 *	'vector' field is larger to support bigger vector values, for example
    253 *	LPI vectors on ARM.
    254 */
    255struct hv_msi_desc3 {
    256	u32	vector;
    257	u8	delivery_mode;
    258	u8	reserved;
    259	u16	vector_count;
    260	u16	processor_count;
    261	u16	processor_array[32];
    262} __packed;
    263
    264/**
    265 * struct tran_int_desc
    266 * @reserved:		unused, padding
    267 * @vector_count:	same as in hv_msi_desc
    268 * @data:		This is the "data payload" value that is
    269 *			written by the device when it generates
    270 *			a message-signaled interrupt, either MSI
    271 *			or MSI-X.
    272 * @address:		This is the address to which the data
    273 *			payload is written on interrupt
    274 *			generation.
    275 */
    276struct tran_int_desc {
    277	u16	reserved;
    278	u16	vector_count;
    279	u32	data;
    280	u64	address;
    281} __packed;
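/*
 * Editorial example (not part of the upstream file): on x86 the returned
 * pair is a conventional MSI doorbell, i.e. an address in the 0xfee00000
 * LAPIC range plus a data value encoding the vector, so a device write of
 * 'data' to 'address' raises the interrupt.  hv_compose_msi_msg() further
 * down copies this pair into the struct msi_msg handed back to the MSI
 * core.
 */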
    282
    283/*
    284 * A generic message format for virtual PCI.
    285 * Specific message formats are defined later in the file.
    286 */
    287
    288struct pci_message {
    289	u32 type;
    290} __packed;
    291
    292struct pci_child_message {
    293	struct pci_message message_type;
    294	union win_slot_encoding wslot;
    295} __packed;
    296
    297struct pci_incoming_message {
    298	struct vmpacket_descriptor hdr;
    299	struct pci_message message_type;
    300} __packed;
    301
    302struct pci_response {
    303	struct vmpacket_descriptor hdr;
    304	s32 status;			/* negative values are failures */
    305} __packed;
    306
    307struct pci_packet {
    308	void (*completion_func)(void *context, struct pci_response *resp,
    309				int resp_packet_size);
    310	void *compl_ctxt;
    311
    312	struct pci_message message[];
    313};
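/*
 * Editorial example (not part of the upstream file): request packets are
 * typically built by embedding a pci_packet at the head of a larger
 * on-stack buffer and using the flexible message[] member to address the
 * protocol-specific body, e.g.:
 *
 *	struct {
 *		struct pci_packet pkt;
 *		u8 buf[sizeof(struct pci_child_message)];
 *	} ctxt;
 *	struct pci_child_message *msg =
 *		(struct pci_child_message *)&ctxt.pkt.message;
 *
 * hv_read_config_block() and hv_int_desc_free() further down follow this
 * pattern with their respective message types.
 */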
    314
    315/*
    316 * Specific message types supporting the PCI protocol.
    317 */
    318
    319/*
    320 * Version negotiation message. Sent from the guest to the host.
    321 * The guest is free to try different versions until the host
    322 * accepts the version.
    323 *
    324 * protocol_version: The protocol version requested.
    325 * is_last_attempt: If TRUE, this is the last version guest will request.
    326 * reservedz: Reserved field, set to zero.
    327 */
    328
    329struct pci_version_request {
    330	struct pci_message message_type;
    331	u32 protocol_version;
    332} __packed;
    333
    334/*
    335 * Bus D0 Entry.  This is sent from the guest to the host when the virtual
    336 * bus (PCI Express port) is ready for action.
    337 */
    338
    339struct pci_bus_d0_entry {
    340	struct pci_message message_type;
    341	u32 reserved;
    342	u64 mmio_base;
    343} __packed;
    344
    345struct pci_bus_relations {
    346	struct pci_incoming_message incoming;
    347	u32 device_count;
    348	struct pci_function_description func[];
    349} __packed;
    350
    351struct pci_bus_relations2 {
    352	struct pci_incoming_message incoming;
    353	u32 device_count;
    354	struct pci_function_description2 func[];
    355} __packed;
    356
    357struct pci_q_res_req_response {
    358	struct vmpacket_descriptor hdr;
    359	s32 status;			/* negative values are failures */
    360	u32 probed_bar[PCI_STD_NUM_BARS];
    361} __packed;
    362
    363struct pci_set_power {
    364	struct pci_message message_type;
    365	union win_slot_encoding wslot;
    366	u32 power_state;		/* In Windows terms */
    367	u32 reserved;
    368} __packed;
    369
    370struct pci_set_power_response {
    371	struct vmpacket_descriptor hdr;
    372	s32 status;			/* negative values are failures */
    373	union win_slot_encoding wslot;
    374	u32 resultant_state;		/* In Windows terms */
    375	u32 reserved;
    376} __packed;
    377
    378struct pci_resources_assigned {
    379	struct pci_message message_type;
    380	union win_slot_encoding wslot;
    381	u8 memory_range[0x14][6];	/* not used here */
    382	u32 msi_descriptors;
    383	u32 reserved[4];
    384} __packed;
    385
    386struct pci_resources_assigned2 {
    387	struct pci_message message_type;
    388	union win_slot_encoding wslot;
    389	u8 memory_range[0x14][6];	/* not used here */
    390	u32 msi_descriptor_count;
    391	u8 reserved[70];
    392} __packed;
    393
    394struct pci_create_interrupt {
    395	struct pci_message message_type;
    396	union win_slot_encoding wslot;
    397	struct hv_msi_desc int_desc;
    398} __packed;
    399
    400struct pci_create_int_response {
    401	struct pci_response response;
    402	u32 reserved;
    403	struct tran_int_desc int_desc;
    404} __packed;
    405
    406struct pci_create_interrupt2 {
    407	struct pci_message message_type;
    408	union win_slot_encoding wslot;
    409	struct hv_msi_desc2 int_desc;
    410} __packed;
    411
    412struct pci_create_interrupt3 {
    413	struct pci_message message_type;
    414	union win_slot_encoding wslot;
    415	struct hv_msi_desc3 int_desc;
    416} __packed;
    417
    418struct pci_delete_interrupt {
    419	struct pci_message message_type;
    420	union win_slot_encoding wslot;
    421	struct tran_int_desc int_desc;
    422} __packed;
    423
    424/*
    425 * Note: the VM must pass a valid block id, wslot and bytes_requested.
    426 */
    427struct pci_read_block {
    428	struct pci_message message_type;
    429	u32 block_id;
    430	union win_slot_encoding wslot;
    431	u32 bytes_requested;
    432} __packed;
    433
    434struct pci_read_block_response {
    435	struct vmpacket_descriptor hdr;
    436	u32 status;
    437	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
    438} __packed;
    439
    440/*
    441 * Note: the VM must pass a valid block id, wslot and byte_count.
    442 */
    443struct pci_write_block {
    444	struct pci_message message_type;
    445	u32 block_id;
    446	union win_slot_encoding wslot;
    447	u32 byte_count;
    448	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
    449} __packed;
    450
    451struct pci_dev_inval_block {
    452	struct pci_incoming_message incoming;
    453	union win_slot_encoding wslot;
    454	u64 block_mask;
    455} __packed;
    456
    457struct pci_dev_incoming {
    458	struct pci_incoming_message incoming;
    459	union win_slot_encoding wslot;
    460} __packed;
    461
    462struct pci_eject_response {
    463	struct pci_message message_type;
    464	union win_slot_encoding wslot;
    465	u32 status;
    466} __packed;
    467
    468static int pci_ring_size = (4 * PAGE_SIZE);
    469
    470/*
    471 * Driver specific state.
    472 */
    473
    474enum hv_pcibus_state {
    475	hv_pcibus_init = 0,
    476	hv_pcibus_probed,
    477	hv_pcibus_installed,
    478	hv_pcibus_removing,
    479	hv_pcibus_maximum
    480};
    481
    482struct hv_pcibus_device {
    483#ifdef CONFIG_X86
    484	struct pci_sysdata sysdata;
    485#elif defined(CONFIG_ARM64)
    486	struct pci_config_window sysdata;
    487#endif
    488	struct pci_host_bridge *bridge;
    489	struct fwnode_handle *fwnode;
    490	/* Protocol version negotiated with the host */
    491	enum pci_protocol_version_t protocol_version;
    492	enum hv_pcibus_state state;
    493	struct hv_device *hdev;
    494	resource_size_t low_mmio_space;
    495	resource_size_t high_mmio_space;
    496	struct resource *mem_config;
    497	struct resource *low_mmio_res;
    498	struct resource *high_mmio_res;
    499	struct completion *survey_event;
    500	struct pci_bus *pci_bus;
    501	spinlock_t config_lock;	/* Avoid two threads writing index page */
    502	spinlock_t device_list_lock;	/* Protect lists below */
    503	void __iomem *cfg_addr;
    504
    505	struct list_head children;
    506	struct list_head dr_list;
    507
    508	struct msi_domain_info msi_info;
    509	struct irq_domain *irq_domain;
    510
    511	spinlock_t retarget_msi_interrupt_lock;
    512
    513	struct workqueue_struct *wq;
    514
    515	/* Highest slot of child device with resources allocated */
    516	int wslot_res_allocated;
    517
    518	/* hypercall arg, must not cross page boundary */
    519	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
    520
    521	/*
    522	 * Don't put anything here: retarget_msi_interrupt_params must be last
    523	 */
    524};
    525
    526/*
    527 * Tracks "Device Relations" messages from the host, which must be both
    528 * processed in order and deferred so that they don't run in the context
    529 * of the incoming packet callback.
    530 */
    531struct hv_dr_work {
    532	struct work_struct wrk;
    533	struct hv_pcibus_device *bus;
    534};
    535
    536struct hv_pcidev_description {
    537	u16	v_id;	/* vendor ID */
    538	u16	d_id;	/* device ID */
    539	u8	rev;
    540	u8	prog_intf;
    541	u8	subclass;
    542	u8	base_class;
    543	u32	subsystem_id;
    544	union	win_slot_encoding win_slot;
    545	u32	ser;	/* serial number */
    546	u32	flags;
    547	u16	virtual_numa_node;
    548};
    549
    550struct hv_dr_state {
    551	struct list_head list_entry;
    552	u32 device_count;
    553	struct hv_pcidev_description func[];
    554};
    555
    556enum hv_pcichild_state {
    557	hv_pcichild_init = 0,
    558	hv_pcichild_requirements,
    559	hv_pcichild_resourced,
    560	hv_pcichild_ejecting,
    561	hv_pcichild_maximum
    562};
    563
    564struct hv_pci_dev {
    565	/* List protected by pci_rescan_remove_lock */
    566	struct list_head list_entry;
    567	refcount_t refs;
    568	enum hv_pcichild_state state;
    569	struct pci_slot *pci_slot;
    570	struct hv_pcidev_description desc;
    571	bool reported_missing;
    572	struct hv_pcibus_device *hbus;
    573	struct work_struct wrk;
    574
    575	void (*block_invalidate)(void *context, u64 block_mask);
    576	void *invalidate_context;
    577
    578	/*
    579	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
    580	 * read it back, for each of the BAR offsets within config space.
    581	 */
    582	u32 probed_bar[PCI_STD_NUM_BARS];
    583};
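/*
 * Editorial example (not part of the upstream file): a probed_bar[] value
 * of 0xfffff004 would decode, following the standard PCI BAR sizing rules,
 * as a non-prefetchable 64-bit memory BAR (low flag bits 0b0100) whose size
 * is ~(0xfffff004 & ~0xfu) + 1 = 0x1000, i.e. 4 KiB.  The host supplies
 * these values in the PCI_QUERY_RESOURCE_REQUIREMENTS response (see struct
 * pci_q_res_req_response above).
 */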
    584
    585struct hv_pci_compl {
    586	struct completion host_event;
    587	s32 completion_status;
    588};
    589
    590static void hv_pci_onchannelcallback(void *context);
    591
    592#ifdef CONFIG_X86
    593#define DELIVERY_MODE	APIC_DELIVERY_MODE_FIXED
    594#define FLOW_HANDLER	handle_edge_irq
    595#define FLOW_NAME	"edge"
    596
    597static int hv_pci_irqchip_init(void)
    598{
    599	return 0;
    600}
    601
    602static struct irq_domain *hv_pci_get_root_domain(void)
    603{
    604	return x86_vector_domain;
    605}
    606
    607static unsigned int hv_msi_get_int_vector(struct irq_data *data)
    608{
    609	struct irq_cfg *cfg = irqd_cfg(data);
    610
    611	return cfg->vector;
    612}
    613
    614static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
    615			  int nvec, msi_alloc_info_t *info)
    616{
    617	int ret = pci_msi_prepare(domain, dev, nvec, info);
    618
    619	/*
    620	 * By using the interrupt remapper in the hypervisor IOMMU, contiguous
     621	 * CPU vectors are not needed for multi-MSI.
    622	 */
    623	if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
    624		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
    625
    626	return ret;
    627}
    628
    629/**
    630 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
    631 * affinity.
    632 * @data:	Describes the IRQ
    633 *
     634 * Build a new destination for the MSI and make a hypercall to
    635 * update the Interrupt Redirection Table. "Device Logical ID"
    636 * is built out of this PCI bus's instance GUID and the function
    637 * number of the device.
    638 */
    639static void hv_arch_irq_unmask(struct irq_data *data)
    640{
    641	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
    642	struct hv_retarget_device_interrupt *params;
    643	struct tran_int_desc *int_desc;
    644	struct hv_pcibus_device *hbus;
    645	struct cpumask *dest;
    646	cpumask_var_t tmp;
    647	struct pci_bus *pbus;
    648	struct pci_dev *pdev;
    649	unsigned long flags;
    650	u32 var_size = 0;
    651	int cpu, nr_bank;
    652	u64 res;
    653
    654	dest = irq_data_get_effective_affinity_mask(data);
    655	pdev = msi_desc_to_pci_dev(msi_desc);
    656	pbus = pdev->bus;
    657	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
    658	int_desc = data->chip_data;
    659
    660	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
    661
    662	params = &hbus->retarget_msi_interrupt_params;
    663	memset(params, 0, sizeof(*params));
    664	params->partition_id = HV_PARTITION_ID_SELF;
    665	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
    666	params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
    667	params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
    668	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
    669			   (hbus->hdev->dev_instance.b[4] << 16) |
    670			   (hbus->hdev->dev_instance.b[7] << 8) |
    671			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
    672			   PCI_FUNC(pdev->devfn);
    673	params->int_target.vector = hv_msi_get_int_vector(data);
    674
    675	/*
    676	 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
    677	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
    678	 * spurious interrupt storm. Not doing so does not seem to have a
    679	 * negative effect (yet?).
    680	 */
    681
    682	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
    683		/*
    684		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
    685		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
    686		 * with >64 VP support.
    687		 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
    688		 * is not sufficient for this hypercall.
    689		 */
    690		params->int_target.flags |=
    691			HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
    692
    693		if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
    694			res = 1;
    695			goto exit_unlock;
    696		}
    697
    698		cpumask_and(tmp, dest, cpu_online_mask);
    699		nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
    700		free_cpumask_var(tmp);
    701
    702		if (nr_bank <= 0) {
    703			res = 1;
    704			goto exit_unlock;
    705		}
    706
    707		/*
    708		 * var-sized hypercall, var-size starts after vp_mask (thus
    709		 * vp_set.format does not count, but vp_set.valid_bank_mask
    710		 * does).
    711		 */
    712		var_size = 1 + nr_bank;
    713	} else {
    714		for_each_cpu_and(cpu, dest, cpu_online_mask) {
    715			params->int_target.vp_mask |=
    716				(1ULL << hv_cpu_number_to_vp_number(cpu));
    717		}
    718	}
    719
    720	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
    721			      params, NULL);
    722
    723exit_unlock:
    724	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
    725
    726	/*
    727	 * During hibernation, when a CPU is offlined, the kernel tries
    728	 * to move the interrupt to the remaining CPUs that haven't
     729	 * been offlined yet. In this case, the above hv_do_hypercall()
    730	 * always fails since the vmbus channel has been closed:
    731	 * refer to cpu_disable_common() -> fixup_irqs() ->
    732	 * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
    733	 *
    734	 * Suppress the error message for hibernation because the failure
    735	 * during hibernation does not matter (at this time all the devices
    736	 * have been frozen). Note: the correct affinity info is still updated
    737	 * into the irqdata data structure in migrate_one_irq() ->
    738	 * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
    739	 * resumes, hv_pci_restore_msi_state() is able to correctly restore
    740	 * the interrupt with the correct affinity.
    741	 */
    742	if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
    743		dev_err(&hbus->hdev->device,
    744			"%s() failed: %#llx", __func__, res);
    745}
    746#elif defined(CONFIG_ARM64)
    747/*
     748 * SPI vectors to use for vPCI; the arch SPI range is [32, 1019], but we leave a
     749 * bit of room at the start to allow for SPIs to be specified through ACPI, and
     750 * we start at a power of two to satisfy the power-of-2 multi-MSI requirement.
    751 */
    752#define HV_PCI_MSI_SPI_START	64
    753#define HV_PCI_MSI_SPI_NR	(1020 - HV_PCI_MSI_SPI_START)
    754#define DELIVERY_MODE		0
    755#define FLOW_HANDLER		NULL
    756#define FLOW_NAME		NULL
    757#define hv_msi_prepare		NULL
    758
    759struct hv_pci_chip_data {
    760	DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
    761	struct mutex	map_lock;
    762};
    763
    764/* Hyper-V vPCI MSI GIC IRQ domain */
    765static struct irq_domain *hv_msi_gic_irq_domain;
    766
    767/* Hyper-V PCI MSI IRQ chip */
    768static struct irq_chip hv_arm64_msi_irq_chip = {
    769	.name = "MSI",
    770	.irq_set_affinity = irq_chip_set_affinity_parent,
    771	.irq_eoi = irq_chip_eoi_parent,
    772	.irq_mask = irq_chip_mask_parent,
    773	.irq_unmask = irq_chip_unmask_parent
    774};
    775
    776static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
    777{
    778	return irqd->parent_data->hwirq;
    779}
    780
    781/*
    782 * @nr_bm_irqs:		Indicates the number of IRQs that were allocated from
    783 *			the bitmap.
    784 * @nr_dom_irqs:	Indicates the number of IRQs that were allocated from
    785 *			the parent domain.
    786 */
    787static void hv_pci_vec_irq_free(struct irq_domain *domain,
    788				unsigned int virq,
    789				unsigned int nr_bm_irqs,
    790				unsigned int nr_dom_irqs)
    791{
    792	struct hv_pci_chip_data *chip_data = domain->host_data;
    793	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
    794	int first = d->hwirq - HV_PCI_MSI_SPI_START;
    795	int i;
    796
    797	mutex_lock(&chip_data->map_lock);
    798	bitmap_release_region(chip_data->spi_map,
    799			      first,
    800			      get_count_order(nr_bm_irqs));
    801	mutex_unlock(&chip_data->map_lock);
    802	for (i = 0; i < nr_dom_irqs; i++) {
    803		if (i)
    804			d = irq_domain_get_irq_data(domain, virq + i);
    805		irq_domain_reset_irq_data(d);
    806	}
    807
    808	irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
    809}
    810
    811static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
    812				       unsigned int virq,
    813				       unsigned int nr_irqs)
    814{
    815	hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
    816}
    817
    818static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
    819				       unsigned int nr_irqs,
    820				       irq_hw_number_t *hwirq)
    821{
    822	struct hv_pci_chip_data *chip_data = domain->host_data;
    823	int index;
    824
    825	/* Find and allocate region from the SPI bitmap */
    826	mutex_lock(&chip_data->map_lock);
    827	index = bitmap_find_free_region(chip_data->spi_map,
    828					HV_PCI_MSI_SPI_NR,
    829					get_count_order(nr_irqs));
    830	mutex_unlock(&chip_data->map_lock);
    831	if (index < 0)
    832		return -ENOSPC;
    833
    834	*hwirq = index + HV_PCI_MSI_SPI_START;
    835
    836	return 0;
    837}
    838
    839static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
    840					   unsigned int virq,
    841					   irq_hw_number_t hwirq)
    842{
    843	struct irq_fwspec fwspec;
    844	struct irq_data *d;
    845	int ret;
    846
    847	fwspec.fwnode = domain->parent->fwnode;
    848	fwspec.param_count = 2;
    849	fwspec.param[0] = hwirq;
    850	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
    851
    852	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
    853	if (ret)
    854		return ret;
    855
    856	/*
    857	 * Since the interrupt specifier is not coming from ACPI or DT, the
    858	 * trigger type will need to be set explicitly. Otherwise, it will be
    859	 * set to whatever is in the GIC configuration.
    860	 */
    861	d = irq_domain_get_irq_data(domain->parent, virq);
    862
    863	return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
    864}
    865
    866static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
    867				       unsigned int virq, unsigned int nr_irqs,
    868				       void *args)
    869{
    870	irq_hw_number_t hwirq;
    871	unsigned int i;
    872	int ret;
    873
    874	ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
    875	if (ret)
    876		return ret;
    877
    878	for (i = 0; i < nr_irqs; i++) {
    879		ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
    880						      hwirq + i);
    881		if (ret) {
    882			hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
    883			return ret;
    884		}
    885
    886		irq_domain_set_hwirq_and_chip(domain, virq + i,
    887					      hwirq + i,
    888					      &hv_arm64_msi_irq_chip,
    889					      domain->host_data);
    890		pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
    891	}
    892
    893	return 0;
    894}
    895
    896/*
    897 * Pick the first cpu as the irq affinity that can be temporarily used for
    898 * composing MSI from the hypervisor. GIC will eventually set the right
    899 * affinity for the irq and the 'unmask' will retarget the interrupt to that
    900 * cpu.
    901 */
    902static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
    903					  struct irq_data *irqd, bool reserve)
    904{
    905	int cpu = cpumask_first(cpu_present_mask);
    906
    907	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
    908
    909	return 0;
    910}
    911
    912static const struct irq_domain_ops hv_pci_domain_ops = {
    913	.alloc	= hv_pci_vec_irq_domain_alloc,
    914	.free	= hv_pci_vec_irq_domain_free,
    915	.activate = hv_pci_vec_irq_domain_activate,
    916};
    917
    918static int hv_pci_irqchip_init(void)
    919{
    920	static struct hv_pci_chip_data *chip_data;
    921	struct fwnode_handle *fn = NULL;
    922	int ret = -ENOMEM;
    923
    924	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
    925	if (!chip_data)
    926		return ret;
    927
    928	mutex_init(&chip_data->map_lock);
    929	fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
    930	if (!fn)
    931		goto free_chip;
    932
    933	/*
     934	 * Once enabled, the IRQ domain should not be removed, since there is
     935	 * no way to ensure that all the corresponding devices are also gone
     936	 * and that no interrupts will be generated.
    937	 */
    938	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
    939							  fn, &hv_pci_domain_ops,
    940							  chip_data);
    941
    942	if (!hv_msi_gic_irq_domain) {
    943		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
    944		goto free_chip;
    945	}
    946
    947	return 0;
    948
    949free_chip:
    950	kfree(chip_data);
    951	if (fn)
    952		irq_domain_free_fwnode(fn);
    953
    954	return ret;
    955}
    956
    957static struct irq_domain *hv_pci_get_root_domain(void)
    958{
    959	return hv_msi_gic_irq_domain;
    960}
    961
    962/*
     963 * SPIs are used for interrupts of PCI devices, and SPIs are managed via GICD
     964 * registers, which Hyper-V already supports, so no hypercall is needed.
    965 */
    966static void hv_arch_irq_unmask(struct irq_data *data) { }
    967#endif /* CONFIG_ARM64 */
    968
    969/**
    970 * hv_pci_generic_compl() - Invoked for a completion packet
    971 * @context:		Set up by the sender of the packet.
    972 * @resp:		The response packet
    973 * @resp_packet_size:	Size in bytes of the packet
    974 *
    975 * This function is used to trigger an event and report status
    976 * for any message for which the completion packet contains a
    977 * status and nothing else.
    978 */
    979static void hv_pci_generic_compl(void *context, struct pci_response *resp,
    980				 int resp_packet_size)
    981{
    982	struct hv_pci_compl *comp_pkt = context;
    983
    984	comp_pkt->completion_status = resp->status;
    985	complete(&comp_pkt->host_event);
    986}
    987
    988static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
    989						u32 wslot);
    990
    991static void get_pcichild(struct hv_pci_dev *hpdev)
    992{
    993	refcount_inc(&hpdev->refs);
    994}
    995
    996static void put_pcichild(struct hv_pci_dev *hpdev)
    997{
    998	if (refcount_dec_and_test(&hpdev->refs))
    999		kfree(hpdev);
   1000}
   1001
   1002/*
   1003 * There is no good way to get notified from vmbus_onoffer_rescind(),
   1004 * so let's use polling here, since this is not a hot path.
   1005 */
   1006static int wait_for_response(struct hv_device *hdev,
   1007			     struct completion *comp)
   1008{
   1009	while (true) {
   1010		if (hdev->channel->rescind) {
   1011			dev_warn_once(&hdev->device, "The device is gone.\n");
   1012			return -ENODEV;
   1013		}
   1014
   1015		if (wait_for_completion_timeout(comp, HZ / 10))
   1016			break;
   1017	}
   1018
   1019	return 0;
   1020}
   1021
   1022/**
   1023 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
   1024 * @devfn:	The Linux representation of PCI slot
   1025 *
   1026 * Windows uses a slightly different representation of PCI slot.
   1027 *
   1028 * Return: The Windows representation
   1029 */
   1030static u32 devfn_to_wslot(int devfn)
   1031{
   1032	union win_slot_encoding wslot;
   1033
   1034	wslot.slot = 0;
   1035	wslot.bits.dev = PCI_SLOT(devfn);
   1036	wslot.bits.func = PCI_FUNC(devfn);
   1037
   1038	return wslot.slot;
   1039}
   1040
   1041/**
   1042 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
   1043 * @wslot:	The Windows representation of PCI slot
   1044 *
   1045 * Windows uses a slightly different representation of PCI slot.
   1046 *
   1047 * Return: The Linux representation
   1048 */
   1049static int wslot_to_devfn(u32 wslot)
   1050{
   1051	union win_slot_encoding slot_no;
   1052
   1053	slot_no.slot = wslot;
   1054	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
   1055}
   1056
   1057/*
   1058 * PCI Configuration Space for these root PCI buses is implemented as a pair
   1059 * of pages in memory-mapped I/O space.  Writing to the first page chooses
   1060 * the PCI function being written or read.  Once the first page has been
   1061 * written to, the following page maps in the entire configuration space of
   1062 * the function.
   1063 */
   1064
   1065/**
   1066 * _hv_pcifront_read_config() - Internal PCI config read
   1067 * @hpdev:	The PCI driver's representation of the device
   1068 * @where:	Offset within config space
   1069 * @size:	Size of the transfer
   1070 * @val:	Pointer to the buffer receiving the data
   1071 */
   1072static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
   1073				     int size, u32 *val)
   1074{
   1075	unsigned long flags;
   1076	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
   1077
   1078	/*
   1079	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
   1080	 */
   1081	if (where + size <= PCI_COMMAND) {
   1082		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
   1083	} else if (where >= PCI_CLASS_REVISION && where + size <=
   1084		   PCI_CACHE_LINE_SIZE) {
   1085		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
   1086		       PCI_CLASS_REVISION, size);
   1087	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
   1088		   PCI_ROM_ADDRESS) {
   1089		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
   1090		       PCI_SUBSYSTEM_VENDOR_ID, size);
   1091	} else if (where >= PCI_ROM_ADDRESS && where + size <=
   1092		   PCI_CAPABILITY_LIST) {
   1093		/* ROM BARs are unimplemented */
   1094		*val = 0;
   1095	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
   1096		   PCI_INTERRUPT_PIN) {
   1097		/*
   1098		 * Interrupt Line and Interrupt PIN are hard-wired to zero
   1099		 * because this front-end only supports message-signaled
   1100		 * interrupts.
   1101		 */
   1102		*val = 0;
   1103	} else if (where + size <= CFG_PAGE_SIZE) {
   1104		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
   1105		/* Choose the function to be read. (See comment above) */
   1106		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
   1107		/* Make sure the function was chosen before we start reading. */
   1108		mb();
   1109		/* Read from that function's config space. */
   1110		switch (size) {
   1111		case 1:
   1112			*val = readb(addr);
   1113			break;
   1114		case 2:
   1115			*val = readw(addr);
   1116			break;
   1117		default:
   1118			*val = readl(addr);
   1119			break;
   1120		}
   1121		/*
   1122		 * Make sure the read was done before we release the spinlock
   1123		 * allowing consecutive reads/writes.
   1124		 */
   1125		mb();
   1126		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
   1127	} else {
   1128		dev_err(&hpdev->hbus->hdev->device,
   1129			"Attempt to read beyond a function's config space.\n");
   1130	}
   1131}
   1132
   1133static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
   1134{
   1135	u16 ret;
   1136	unsigned long flags;
   1137	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
   1138			     PCI_VENDOR_ID;
   1139
   1140	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
   1141
   1142	/* Choose the function to be read. (See comment above) */
   1143	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
   1144	/* Make sure the function was chosen before we start reading. */
   1145	mb();
   1146	/* Read from that function's config space. */
   1147	ret = readw(addr);
   1148	/*
   1149	 * mb() is not required here, because the spin_unlock_irqrestore()
   1150	 * is a barrier.
   1151	 */
   1152
   1153	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
   1154
   1155	return ret;
   1156}
   1157
   1158/**
   1159 * _hv_pcifront_write_config() - Internal PCI config write
   1160 * @hpdev:	The PCI driver's representation of the device
   1161 * @where:	Offset within config space
   1162 * @size:	Size of the transfer
   1163 * @val:	The data being transferred
   1164 */
   1165static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
   1166				      int size, u32 val)
   1167{
   1168	unsigned long flags;
   1169	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
   1170
   1171	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
   1172	    where + size <= PCI_CAPABILITY_LIST) {
   1173		/* SSIDs and ROM BARs are read-only */
   1174	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
   1175		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
   1176		/* Choose the function to be written. (See comment above) */
   1177		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
   1178		/* Make sure the function was chosen before we start writing. */
   1179		wmb();
   1180		/* Write to that function's config space. */
   1181		switch (size) {
   1182		case 1:
   1183			writeb(val, addr);
   1184			break;
   1185		case 2:
   1186			writew(val, addr);
   1187			break;
   1188		default:
   1189			writel(val, addr);
   1190			break;
   1191		}
   1192		/*
   1193		 * Make sure the write was done before we release the spinlock
   1194		 * allowing consecutive reads/writes.
   1195		 */
   1196		mb();
   1197		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
   1198	} else {
   1199		dev_err(&hpdev->hbus->hdev->device,
   1200			"Attempt to write beyond a function's config space.\n");
   1201	}
   1202}
   1203
   1204/**
   1205 * hv_pcifront_read_config() - Read configuration space
   1206 * @bus: PCI Bus structure
   1207 * @devfn: Device/function
   1208 * @where: Offset from base
   1209 * @size: Byte/word/dword
   1210 * @val: Value to be read
   1211 *
   1212 * Return: PCIBIOS_SUCCESSFUL on success
   1213 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
   1214 */
   1215static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
   1216				   int where, int size, u32 *val)
   1217{
   1218	struct hv_pcibus_device *hbus =
   1219		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
   1220	struct hv_pci_dev *hpdev;
   1221
   1222	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
   1223	if (!hpdev)
   1224		return PCIBIOS_DEVICE_NOT_FOUND;
   1225
   1226	_hv_pcifront_read_config(hpdev, where, size, val);
   1227
   1228	put_pcichild(hpdev);
   1229	return PCIBIOS_SUCCESSFUL;
   1230}
   1231
   1232/**
   1233 * hv_pcifront_write_config() - Write configuration space
   1234 * @bus: PCI Bus structure
   1235 * @devfn: Device/function
   1236 * @where: Offset from base
   1237 * @size: Byte/word/dword
   1238 * @val: Value to be written to device
   1239 *
   1240 * Return: PCIBIOS_SUCCESSFUL on success
   1241 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
   1242 */
   1243static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
   1244				    int where, int size, u32 val)
   1245{
   1246	struct hv_pcibus_device *hbus =
   1247	    container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
   1248	struct hv_pci_dev *hpdev;
   1249
   1250	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
   1251	if (!hpdev)
   1252		return PCIBIOS_DEVICE_NOT_FOUND;
   1253
   1254	_hv_pcifront_write_config(hpdev, where, size, val);
   1255
   1256	put_pcichild(hpdev);
   1257	return PCIBIOS_SUCCESSFUL;
   1258}
   1259
   1260/* PCIe operations */
   1261static struct pci_ops hv_pcifront_ops = {
   1262	.read  = hv_pcifront_read_config,
   1263	.write = hv_pcifront_write_config,
   1264};
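/*
 * Editorial note (not part of the upstream file): these ops are installed
 * on the Hyper-V host bridge elsewhere in this file (outside this excerpt),
 * so a core accessor such as
 *
 *	pci_bus_read_config_word(bus, devfn, PCI_VENDOR_ID, &vid);
 *
 * on one of these buses ends up in hv_pcifront_read_config() above.
 */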
   1265
   1266/*
   1267 * Paravirtual backchannel
   1268 *
   1269 * Hyper-V SR-IOV provides a backchannel mechanism in software for
   1270 * communication between a VF driver and a PF driver.  These
   1271 * "configuration blocks" are similar in concept to PCI configuration space,
   1272 * but instead of doing reads and writes in 32-bit chunks through a very slow
   1273 * path, packets of up to 128 bytes can be sent or received asynchronously.
   1274 *
   1275 * Nearly every SR-IOV device contains just such a communications channel in
   1276 * hardware, so using this one in software is usually optional.  Using the
   1277 * software channel, however, allows driver implementers to leverage software
   1278 * tools that fuzz the communications channel looking for vulnerabilities.
   1279 *
   1280 * The usage model for these packets puts the responsibility for reading or
   1281 * writing on the VF driver.  The VF driver sends a read or a write packet,
   1282 * indicating which "block" is being referred to by number.
   1283 *
   1284 * If the PF driver wishes to initiate communication, it can "invalidate" one or
   1285 * more of the first 64 blocks.  This invalidation is delivered via a callback
    1286 * that the VF driver registers with this driver.
   1287 *
   1288 * No protocol is implied, except that supplied by the PF and VF drivers.
   1289 */
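/*
 * Editorial sketch (not part of the upstream file) of how a VF driver
 * might use this backchannel.  It assumes the hyperv_read_cfg_blk() and
 * hyperv_write_cfg_blk() wrappers exported by pci-hyperv-intf.c, which
 * route to hv_read_config_block() and hv_write_config_block() below; the
 * block IDs and the query flow are made up for illustration only.
 */
#if 0	/* illustration only, not compiled */
static int example_vf_query_pf(struct pci_dev *vf_dev)
{
	u8 buf[HV_CONFIG_BLOCK_SIZE_MAX];
	unsigned int returned = 0;
	int ret;

	/* Read the contents of (hypothetical) config block 0 from the PF. */
	ret = hyperv_read_cfg_blk(vf_dev, buf, sizeof(buf), 0, &returned);
	if (ret)
		return ret;

	/* Send up to 128 bytes back to the PF as (hypothetical) block 1. */
	return hyperv_write_cfg_blk(vf_dev, buf, returned, 1);
}
#endif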
   1290
   1291struct hv_read_config_compl {
   1292	struct hv_pci_compl comp_pkt;
   1293	void *buf;
   1294	unsigned int len;
   1295	unsigned int bytes_returned;
   1296};
   1297
   1298/**
   1299 * hv_pci_read_config_compl() - Invoked when a response packet
   1300 * for a read config block operation arrives.
   1301 * @context:		Identifies the read config operation
   1302 * @resp:		The response packet itself
   1303 * @resp_packet_size:	Size in bytes of the response packet
   1304 */
   1305static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
   1306				     int resp_packet_size)
   1307{
   1308	struct hv_read_config_compl *comp = context;
   1309	struct pci_read_block_response *read_resp =
   1310		(struct pci_read_block_response *)resp;
   1311	unsigned int data_len, hdr_len;
   1312
   1313	hdr_len = offsetof(struct pci_read_block_response, bytes);
   1314	if (resp_packet_size < hdr_len) {
   1315		comp->comp_pkt.completion_status = -1;
   1316		goto out;
   1317	}
   1318
   1319	data_len = resp_packet_size - hdr_len;
   1320	if (data_len > 0 && read_resp->status == 0) {
   1321		comp->bytes_returned = min(comp->len, data_len);
   1322		memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
   1323	} else {
   1324		comp->bytes_returned = 0;
   1325	}
   1326
   1327	comp->comp_pkt.completion_status = read_resp->status;
   1328out:
   1329	complete(&comp->comp_pkt.host_event);
   1330}
   1331
   1332/**
   1333 * hv_read_config_block() - Sends a read config block request to
   1334 * the back-end driver running in the Hyper-V parent partition.
   1335 * @pdev:		The PCI driver's representation for this device.
   1336 * @buf:		Buffer into which the config block will be copied.
   1337 * @len:		Size in bytes of buf.
   1338 * @block_id:		Identifies the config block which has been requested.
   1339 * @bytes_returned:	Size which came back from the back-end driver.
   1340 *
   1341 * Return: 0 on success, -errno on failure
   1342 */
   1343static int hv_read_config_block(struct pci_dev *pdev, void *buf,
   1344				unsigned int len, unsigned int block_id,
   1345				unsigned int *bytes_returned)
   1346{
   1347	struct hv_pcibus_device *hbus =
   1348		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
   1349			     sysdata);
   1350	struct {
   1351		struct pci_packet pkt;
   1352		char buf[sizeof(struct pci_read_block)];
   1353	} pkt;
   1354	struct hv_read_config_compl comp_pkt;
   1355	struct pci_read_block *read_blk;
   1356	int ret;
   1357
   1358	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
   1359		return -EINVAL;
   1360
   1361	init_completion(&comp_pkt.comp_pkt.host_event);
   1362	comp_pkt.buf = buf;
   1363	comp_pkt.len = len;
   1364
   1365	memset(&pkt, 0, sizeof(pkt));
   1366	pkt.pkt.completion_func = hv_pci_read_config_compl;
   1367	pkt.pkt.compl_ctxt = &comp_pkt;
   1368	read_blk = (struct pci_read_block *)&pkt.pkt.message;
   1369	read_blk->message_type.type = PCI_READ_BLOCK;
   1370	read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
   1371	read_blk->block_id = block_id;
   1372	read_blk->bytes_requested = len;
   1373
   1374	ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
   1375			       sizeof(*read_blk), (unsigned long)&pkt.pkt,
   1376			       VM_PKT_DATA_INBAND,
   1377			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   1378	if (ret)
   1379		return ret;
   1380
   1381	ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
   1382	if (ret)
   1383		return ret;
   1384
   1385	if (comp_pkt.comp_pkt.completion_status != 0 ||
   1386	    comp_pkt.bytes_returned == 0) {
   1387		dev_err(&hbus->hdev->device,
   1388			"Read Config Block failed: 0x%x, bytes_returned=%d\n",
   1389			comp_pkt.comp_pkt.completion_status,
   1390			comp_pkt.bytes_returned);
   1391		return -EIO;
   1392	}
   1393
   1394	*bytes_returned = comp_pkt.bytes_returned;
   1395	return 0;
   1396}
   1397
   1398/**
   1399 * hv_pci_write_config_compl() - Invoked when a response packet for a write
   1400 * config block operation arrives.
   1401 * @context:		Identifies the write config operation
   1402 * @resp:		The response packet itself
   1403 * @resp_packet_size:	Size in bytes of the response packet
   1404 */
   1405static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
   1406				      int resp_packet_size)
   1407{
   1408	struct hv_pci_compl *comp_pkt = context;
   1409
   1410	comp_pkt->completion_status = resp->status;
   1411	complete(&comp_pkt->host_event);
   1412}
   1413
   1414/**
   1415 * hv_write_config_block() - Sends a write config block request to the
   1416 * back-end driver running in the Hyper-V parent partition.
   1417 * @pdev:		The PCI driver's representation for this device.
   1418 * @buf:		Buffer from which the config block will be copied.
   1419 * @len:		Size in bytes of buf.
   1420 * @block_id:		Identifies the config block which is being written.
   1421 *
   1422 * Return: 0 on success, -errno on failure
   1423 */
   1424static int hv_write_config_block(struct pci_dev *pdev, void *buf,
   1425				unsigned int len, unsigned int block_id)
   1426{
   1427	struct hv_pcibus_device *hbus =
   1428		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
   1429			     sysdata);
   1430	struct {
   1431		struct pci_packet pkt;
   1432		char buf[sizeof(struct pci_write_block)];
   1433		u32 reserved;
   1434	} pkt;
   1435	struct hv_pci_compl comp_pkt;
   1436	struct pci_write_block *write_blk;
   1437	u32 pkt_size;
   1438	int ret;
   1439
   1440	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
   1441		return -EINVAL;
   1442
   1443	init_completion(&comp_pkt.host_event);
   1444
   1445	memset(&pkt, 0, sizeof(pkt));
   1446	pkt.pkt.completion_func = hv_pci_write_config_compl;
   1447	pkt.pkt.compl_ctxt = &comp_pkt;
   1448	write_blk = (struct pci_write_block *)&pkt.pkt.message;
   1449	write_blk->message_type.type = PCI_WRITE_BLOCK;
   1450	write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
   1451	write_blk->block_id = block_id;
   1452	write_blk->byte_count = len;
   1453	memcpy(write_blk->bytes, buf, len);
   1454	pkt_size = offsetof(struct pci_write_block, bytes) + len;
   1455	/*
   1456	 * This quirk is required on some hosts shipped around 2018, because
   1457	 * these hosts don't check the pkt_size correctly (new hosts have been
   1458	 * fixed since early 2019). The quirk is also safe on very old hosts
   1459	 * and new hosts, because, on them, what really matters is the length
   1460	 * specified in write_blk->byte_count.
   1461	 */
   1462	pkt_size += sizeof(pkt.reserved);
   1463
   1464	ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
   1465			       (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
   1466			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   1467	if (ret)
   1468		return ret;
   1469
   1470	ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
   1471	if (ret)
   1472		return ret;
   1473
   1474	if (comp_pkt.completion_status != 0) {
   1475		dev_err(&hbus->hdev->device,
   1476			"Write Config Block failed: 0x%x\n",
   1477			comp_pkt.completion_status);
   1478		return -EIO;
   1479	}
   1480
   1481	return 0;
   1482}
   1483
   1484/**
   1485 * hv_register_block_invalidate() - Invoked when a config block invalidation
   1486 * arrives from the back-end driver.
   1487 * @pdev:		The PCI driver's representation for this device.
   1488 * @context:		Identifies the device.
   1489 * @block_invalidate:	Identifies all of the blocks being invalidated.
   1490 *
   1491 * Return: 0 on success, -errno on failure
   1492 */
   1493static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
   1494					void (*block_invalidate)(void *context,
   1495								 u64 block_mask))
   1496{
   1497	struct hv_pcibus_device *hbus =
   1498		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
   1499			     sysdata);
   1500	struct hv_pci_dev *hpdev;
   1501
   1502	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
   1503	if (!hpdev)
   1504		return -ENODEV;
   1505
   1506	hpdev->block_invalidate = block_invalidate;
   1507	hpdev->invalidate_context = context;
   1508
   1509	put_pcichild(hpdev);
   1510	return 0;
   1511
   1512}
   1513
   1514/* Interrupt management hooks */
   1515static void hv_int_desc_free(struct hv_pci_dev *hpdev,
   1516			     struct tran_int_desc *int_desc)
   1517{
   1518	struct pci_delete_interrupt *int_pkt;
   1519	struct {
   1520		struct pci_packet pkt;
   1521		u8 buffer[sizeof(struct pci_delete_interrupt)];
   1522	} ctxt;
   1523
   1524	if (!int_desc->vector_count) {
   1525		kfree(int_desc);
   1526		return;
   1527	}
   1528	memset(&ctxt, 0, sizeof(ctxt));
   1529	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
   1530	int_pkt->message_type.type =
   1531		PCI_DELETE_INTERRUPT_MESSAGE;
   1532	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
   1533	int_pkt->int_desc = *int_desc;
   1534	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
   1535			 0, VM_PKT_DATA_INBAND, 0);
   1536	kfree(int_desc);
   1537}
   1538
   1539/**
   1540 * hv_msi_free() - Free the MSI.
   1541 * @domain:	The interrupt domain pointer
   1542 * @info:	Extra MSI-related context
   1543 * @irq:	Identifies the IRQ.
   1544 *
   1545 * The Hyper-V parent partition and hypervisor are tracking the
   1546 * messages that are in use, keeping the interrupt redirection
   1547 * table up to date.  This callback sends a message that frees
   1548 * the IRT entry and related tracking nonsense.
   1549 */
   1550static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
   1551			unsigned int irq)
   1552{
   1553	struct hv_pcibus_device *hbus;
   1554	struct hv_pci_dev *hpdev;
   1555	struct pci_dev *pdev;
   1556	struct tran_int_desc *int_desc;
   1557	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
   1558	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
   1559
   1560	pdev = msi_desc_to_pci_dev(msi);
   1561	hbus = info->data;
   1562	int_desc = irq_data_get_irq_chip_data(irq_data);
   1563	if (!int_desc)
   1564		return;
   1565
   1566	irq_data->chip_data = NULL;
   1567	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
   1568	if (!hpdev) {
   1569		kfree(int_desc);
   1570		return;
   1571	}
   1572
   1573	hv_int_desc_free(hpdev, int_desc);
   1574	put_pcichild(hpdev);
   1575}
   1576
   1577static void hv_irq_mask(struct irq_data *data)
   1578{
   1579	pci_msi_mask_irq(data);
   1580	if (data->parent_data->chip->irq_mask)
   1581		irq_chip_mask_parent(data);
   1582}
   1583
   1584static void hv_irq_unmask(struct irq_data *data)
   1585{
   1586	hv_arch_irq_unmask(data);
   1587
   1588	if (data->parent_data->chip->irq_unmask)
   1589		irq_chip_unmask_parent(data);
   1590	pci_msi_unmask_irq(data);
   1591}
   1592
   1593struct compose_comp_ctxt {
   1594	struct hv_pci_compl comp_pkt;
   1595	struct tran_int_desc int_desc;
   1596};
   1597
   1598static void hv_pci_compose_compl(void *context, struct pci_response *resp,
   1599				 int resp_packet_size)
   1600{
   1601	struct compose_comp_ctxt *comp_pkt = context;
   1602	struct pci_create_int_response *int_resp =
   1603		(struct pci_create_int_response *)resp;
   1604
   1605	if (resp_packet_size < sizeof(*int_resp)) {
   1606		comp_pkt->comp_pkt.completion_status = -1;
   1607		goto out;
   1608	}
   1609	comp_pkt->comp_pkt.completion_status = resp->status;
   1610	comp_pkt->int_desc = int_resp->int_desc;
   1611out:
   1612	complete(&comp_pkt->comp_pkt.host_event);
   1613}
   1614
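/*
 * The hv_compose_msi_req_v1/v2/v3() helpers below build the
 * PCI_CREATE_INTERRUPT_MESSAGE variant matching the negotiated protocol
 * version: v1 describes the interrupt target as a CPU mask, while v2 and v3
 * name an explicit virtual processor; v3 additionally carries a 32-bit
 * vector instead of the 8-bit vector used by v1/v2.
 */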
   1615static u32 hv_compose_msi_req_v1(
   1616	struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
   1617	u32 slot, u8 vector, u8 vector_count)
   1618{
   1619	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
   1620	int_pkt->wslot.slot = slot;
   1621	int_pkt->int_desc.vector = vector;
   1622	int_pkt->int_desc.vector_count = vector_count;
   1623	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
   1624
   1625	/*
   1626	 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
   1627	 * hv_irq_unmask().
   1628	 */
   1629	int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
   1630
   1631	return sizeof(*int_pkt);
   1632}
   1633
   1634/*
   1635 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
   1636 * by subsequent retarget in hv_irq_unmask().
   1637 */
   1638static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
   1639{
   1640	return cpumask_first_and(affinity, cpu_online_mask);
   1641}
   1642
   1643static u32 hv_compose_msi_req_v2(
   1644	struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
   1645	u32 slot, u8 vector, u8 vector_count)
   1646{
   1647	int cpu;
   1648
   1649	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
   1650	int_pkt->wslot.slot = slot;
   1651	int_pkt->int_desc.vector = vector;
   1652	int_pkt->int_desc.vector_count = vector_count;
   1653	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
   1654	cpu = hv_compose_msi_req_get_cpu(affinity);
   1655	int_pkt->int_desc.processor_array[0] =
   1656		hv_cpu_number_to_vp_number(cpu);
   1657	int_pkt->int_desc.processor_count = 1;
   1658
   1659	return sizeof(*int_pkt);
   1660}
   1661
   1662static u32 hv_compose_msi_req_v3(
   1663	struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
   1664	u32 slot, u32 vector, u8 vector_count)
   1665{
   1666	int cpu;
   1667
   1668	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
   1669	int_pkt->wslot.slot = slot;
   1670	int_pkt->int_desc.vector = vector;
   1671	int_pkt->int_desc.reserved = 0;
   1672	int_pkt->int_desc.vector_count = vector_count;
   1673	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
   1674	cpu = hv_compose_msi_req_get_cpu(affinity);
   1675	int_pkt->int_desc.processor_array[0] =
   1676		hv_cpu_number_to_vp_number(cpu);
   1677	int_pkt->int_desc.processor_count = 1;
   1678
   1679	return sizeof(*int_pkt);
   1680}
   1681
   1682/**
   1683 * hv_compose_msi_msg() - Supplies a valid MSI address/data
   1684 * @data:	Everything about this MSI
   1685 * @msg:	Buffer that is filled in by this function
   1686 *
   1687 * This function unpacks the IRQ looking for target CPU set, IDT
   1688 * vector and mode and sends a message to the parent partition
   1689 * asking for a mapping for that tuple in this partition.  The
   1690 * response supplies a data value and address to which that data
   1691 * should be written to trigger that interrupt.
   1692 */
   1693static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
   1694{
   1695	struct hv_pcibus_device *hbus;
   1696	struct vmbus_channel *channel;
   1697	struct hv_pci_dev *hpdev;
   1698	struct pci_bus *pbus;
   1699	struct pci_dev *pdev;
   1700	struct cpumask *dest;
   1701	struct compose_comp_ctxt comp;
   1702	struct tran_int_desc *int_desc;
   1703	struct msi_desc *msi_desc;
   1704	u8 vector, vector_count;
   1705	struct {
   1706		struct pci_packet pci_pkt;
   1707		union {
   1708			struct pci_create_interrupt v1;
   1709			struct pci_create_interrupt2 v2;
   1710			struct pci_create_interrupt3 v3;
   1711		} int_pkts;
   1712	} __packed ctxt;
   1713	u64 trans_id;
   1714	u32 size;
   1715	int ret;
   1716
   1717	/* Reuse the previous allocation */
   1718	if (data->chip_data) {
   1719		int_desc = data->chip_data;
   1720		msg->address_hi = int_desc->address >> 32;
   1721		msg->address_lo = int_desc->address & 0xffffffff;
   1722		msg->data = int_desc->data;
   1723		return;
   1724	}
   1725
   1726	msi_desc  = irq_data_get_msi_desc(data);
   1727	pdev = msi_desc_to_pci_dev(msi_desc);
   1728	dest = irq_data_get_effective_affinity_mask(data);
   1729	pbus = pdev->bus;
   1730	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
   1731	channel = hbus->hdev->channel;
   1732	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
   1733	if (!hpdev)
   1734		goto return_null_message;
   1735
   1736	int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
   1737	if (!int_desc)
   1738		goto drop_reference;
   1739
   1740	if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
   1741		/*
   1742		 * If this is not the first MSI of Multi MSI, we already have
   1743		 * a mapping.  Can exit early.
   1744		 */
   1745		if (msi_desc->irq != data->irq) {
   1746			data->chip_data = int_desc;
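			/*
			 * Multi-MSI blocks use consecutive data values: the
			 * n-th interrupt of the block reuses the first
			 * interrupt's address with (data + n), as computed
			 * below.
			 */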
   1747			int_desc->address = msi_desc->msg.address_lo |
   1748					    (u64)msi_desc->msg.address_hi << 32;
   1749			int_desc->data = msi_desc->msg.data +
   1750					 (data->irq - msi_desc->irq);
   1751			msg->address_hi = msi_desc->msg.address_hi;
   1752			msg->address_lo = msi_desc->msg.address_lo;
   1753			msg->data = int_desc->data;
   1754			put_pcichild(hpdev);
   1755			return;
   1756		}
   1757		/*
   1758		 * The vector we select here is a dummy value.  The correct
   1759		 * value gets sent to the hypervisor in unmask().  This needs
   1760		 * to be aligned with the count, and also not zero.  Multi-msi
   1761		 * is powers of 2 up to 32, so 32 will always work here.
   1762		 */
   1763		vector = 32;
   1764		vector_count = msi_desc->nvec_used;
   1765	} else {
   1766		vector = hv_msi_get_int_vector(data);
   1767		vector_count = 1;
   1768	}
   1769
   1770	memset(&ctxt, 0, sizeof(ctxt));
   1771	init_completion(&comp.comp_pkt.host_event);
   1772	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
   1773	ctxt.pci_pkt.compl_ctxt = &comp;
   1774
   1775	switch (hbus->protocol_version) {
   1776	case PCI_PROTOCOL_VERSION_1_1:
   1777		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
   1778					dest,
   1779					hpdev->desc.win_slot.slot,
   1780					vector,
   1781					vector_count);
   1782		break;
   1783
   1784	case PCI_PROTOCOL_VERSION_1_2:
   1785	case PCI_PROTOCOL_VERSION_1_3:
   1786		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
   1787					dest,
   1788					hpdev->desc.win_slot.slot,
   1789					vector,
   1790					vector_count);
   1791		break;
   1792
   1793	case PCI_PROTOCOL_VERSION_1_4:
   1794		size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
   1795					dest,
   1796					hpdev->desc.win_slot.slot,
   1797					vector,
   1798					vector_count);
   1799		break;
   1800
   1801	default:
   1802		/* As we only negotiate protocol versions known to this driver,
    1803		 * this path should never be hit.  However, this is not a hot
    1804		 * path, so we print a message to aid future updates.
   1805		 */
   1806		dev_err(&hbus->hdev->device,
    1807			"Unexpected vPCI protocol, update driver.\n");
   1808		goto free_int_desc;
   1809	}
   1810
   1811	ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
   1812				     size, (unsigned long)&ctxt.pci_pkt,
   1813				     &trans_id, VM_PKT_DATA_INBAND,
   1814				     VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   1815	if (ret) {
   1816		dev_err(&hbus->hdev->device,
    1817			"Sending request for interrupt failed: 0x%x\n",
    1818			ret);
   1819		goto free_int_desc;
   1820	}
   1821
   1822	/*
   1823	 * Prevents hv_pci_onchannelcallback() from running concurrently
   1824	 * in the tasklet.
   1825	 */
   1826	tasklet_disable_in_atomic(&channel->callback_event);
   1827
   1828	/*
    1829	 * Since this function is called with IRQ locks held, it can't
    1830	 * do a normal wait for completion; instead it polls.
   1831	 */
   1832	while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
   1833		unsigned long flags;
   1834
   1835		/* 0xFFFF means an invalid PCI VENDOR ID. */
   1836		if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
   1837			dev_err_once(&hbus->hdev->device,
   1838				     "the device has gone\n");
   1839			goto enable_tasklet;
   1840		}
   1841
   1842		/*
   1843		 * Make sure that the ring buffer data structure doesn't get
   1844		 * freed while we dereference the ring buffer pointer.  Test
   1845		 * for the channel's onchannel_callback being NULL within a
   1846		 * sched_lock critical section.  See also the inline comments
   1847		 * in vmbus_reset_channel_cb().
   1848		 */
   1849		spin_lock_irqsave(&channel->sched_lock, flags);
   1850		if (unlikely(channel->onchannel_callback == NULL)) {
   1851			spin_unlock_irqrestore(&channel->sched_lock, flags);
   1852			goto enable_tasklet;
   1853		}
   1854		hv_pci_onchannelcallback(hbus);
   1855		spin_unlock_irqrestore(&channel->sched_lock, flags);
   1856
   1857		if (hpdev->state == hv_pcichild_ejecting) {
   1858			dev_err_once(&hbus->hdev->device,
   1859				     "the device is being ejected\n");
   1860			goto enable_tasklet;
   1861		}
   1862
   1863		udelay(100);
   1864	}
   1865
   1866	tasklet_enable(&channel->callback_event);
   1867
   1868	if (comp.comp_pkt.completion_status < 0) {
   1869		dev_err(&hbus->hdev->device,
    1870			"Request for interrupt failed: 0x%x\n",
   1871			comp.comp_pkt.completion_status);
   1872		goto free_int_desc;
   1873	}
   1874
   1875	/*
   1876	 * Record the assignment so that this can be unwound later. Using
   1877	 * irq_set_chip_data() here would be appropriate, but the lock it takes
   1878	 * is already held.
   1879	 */
   1880	*int_desc = comp.int_desc;
   1881	data->chip_data = int_desc;
   1882
   1883	/* Pass up the result. */
   1884	msg->address_hi = comp.int_desc.address >> 32;
   1885	msg->address_lo = comp.int_desc.address & 0xffffffff;
   1886	msg->data = comp.int_desc.data;
   1887
   1888	put_pcichild(hpdev);
   1889	return;
   1890
   1891enable_tasklet:
   1892	tasklet_enable(&channel->callback_event);
   1893	/*
   1894	 * The completion packet on the stack becomes invalid after 'return';
   1895	 * remove the ID from the VMbus requestor if the identifier is still
   1896	 * mapped to/associated with the packet.  (The identifier could have
   1897	 * been 're-used', i.e., already removed and (re-)mapped.)
   1898	 *
   1899	 * Cf. hv_pci_onchannelcallback().
   1900	 */
   1901	vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt);
   1902free_int_desc:
   1903	kfree(int_desc);
   1904drop_reference:
   1905	put_pcichild(hpdev);
   1906return_null_message:
   1907	msg->address_hi = 0;
   1908	msg->address_lo = 0;
   1909	msg->data = 0;
   1910}
   1911
   1912/* HW Interrupt Chip Descriptor */
   1913static struct irq_chip hv_msi_irq_chip = {
   1914	.name			= "Hyper-V PCIe MSI",
   1915	.irq_compose_msi_msg	= hv_compose_msi_msg,
   1916	.irq_set_affinity	= irq_chip_set_affinity_parent,
   1917#ifdef CONFIG_X86
   1918	.irq_ack		= irq_chip_ack_parent,
   1919#elif defined(CONFIG_ARM64)
   1920	.irq_eoi		= irq_chip_eoi_parent,
   1921#endif
   1922	.irq_mask		= hv_irq_mask,
   1923	.irq_unmask		= hv_irq_unmask,
   1924};
   1925
   1926static struct msi_domain_ops hv_msi_ops = {
   1927	.msi_prepare	= hv_msi_prepare,
   1928	.msi_free	= hv_msi_free,
   1929};
   1930
   1931/**
   1932 * hv_pcie_init_irq_domain() - Initialize IRQ domain
   1933 * @hbus:	The root PCI bus
   1934 *
   1935 * This function creates an IRQ domain which will be used for
   1936 * interrupts from devices that have been passed through.  These
   1937 * devices only support MSI and MSI-X, not line-based interrupts
   1938 * or simulations of line-based interrupts through PCIe's
   1939 * fabric-layer messages.  Because interrupts are remapped, we
   1940 * can support multi-message MSI here.
   1941 *
   1942 * Return: '0' on success and error value on failure
   1943 */
   1944static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
   1945{
   1946	hbus->msi_info.chip = &hv_msi_irq_chip;
   1947	hbus->msi_info.ops = &hv_msi_ops;
   1948	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
   1949		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
   1950		MSI_FLAG_PCI_MSIX);
   1951	hbus->msi_info.handler = FLOW_HANDLER;
   1952	hbus->msi_info.handler_name = FLOW_NAME;
   1953	hbus->msi_info.data = hbus;
   1954	hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
   1955						     &hbus->msi_info,
   1956						     hv_pci_get_root_domain());
   1957	if (!hbus->irq_domain) {
   1958		dev_err(&hbus->hdev->device,
   1959			"Failed to build an MSI IRQ domain\n");
   1960		return -ENODEV;
   1961	}
   1962
   1963	dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain);
   1964
   1965	return 0;
   1966}
   1967
   1968/**
   1969 * get_bar_size() - Get the address space consumed by a BAR
   1970 * @bar_val:	Value that a BAR returned after -1 was written
   1971 *              to it.
   1972 *
   1973 * This function returns the size of the BAR, rounded up to 1
   1974 * page.  It has to be rounded up because the hypervisor's page
   1975 * table entry that maps the BAR into the VM can't specify an
   1976 * offset within a page.  The invariant is that the hypervisor
   1977 * must place any BARs of smaller than page length at the
   1978 * beginning of a page.
   1979 *
   1980 * Return:	Size in bytes of the consumed MMIO space.
   1981 */
   1982static u64 get_bar_size(u64 bar_val)
   1983{
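	/*
	 * Standard BAR sizing arithmetic: the probed value has all writable
	 * address bits set, so masking off the flag bits and taking the two's
	 * complement (1 + ~x) yields the size.  For example, a probed value
	 * of 0xffffffffffffc00c masks to 0xffffffffffffc000, and
	 * 1 + ~0xffffffffffffc000 = 0x4000 (16 KiB), which is then rounded
	 * up to a whole page.
	 */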
   1984	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
   1985			PAGE_SIZE);
   1986}
   1987
   1988/**
   1989 * survey_child_resources() - Total all MMIO requirements
   1990 * @hbus:	Root PCI bus, as understood by this driver
   1991 */
   1992static void survey_child_resources(struct hv_pcibus_device *hbus)
   1993{
   1994	struct hv_pci_dev *hpdev;
   1995	resource_size_t bar_size = 0;
   1996	unsigned long flags;
   1997	struct completion *event;
   1998	u64 bar_val;
   1999	int i;
   2000
   2001	/* If nobody is waiting on the answer, don't compute it. */
   2002	event = xchg(&hbus->survey_event, NULL);
   2003	if (!event)
   2004		return;
   2005
   2006	/* If the answer has already been computed, go with it. */
   2007	if (hbus->low_mmio_space || hbus->high_mmio_space) {
   2008		complete(event);
   2009		return;
   2010	}
   2011
   2012	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2013
   2014	/*
   2015	 * Due to an interesting quirk of the PCI spec, all memory regions
   2016	 * for a child device are a power of 2 in size and aligned in memory,
   2017	 * so it's sufficient to just add them up without tracking alignment.
   2018	 */
   2019	list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2020		for (i = 0; i < PCI_STD_NUM_BARS; i++) {
   2021			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
   2022				dev_err(&hbus->hdev->device,
   2023					"There's an I/O BAR in this list!\n");
   2024
   2025			if (hpdev->probed_bar[i] != 0) {
   2026				/*
   2027				 * A probed BAR has all the upper bits set that
   2028				 * can be changed.
   2029				 */
   2030
   2031				bar_val = hpdev->probed_bar[i];
   2032				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
   2033					bar_val |=
   2034					((u64)hpdev->probed_bar[++i] << 32);
   2035				else
   2036					bar_val |= 0xffffffff00000000ULL;
   2037
   2038				bar_size = get_bar_size(bar_val);
   2039
   2040				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
   2041					hbus->high_mmio_space += bar_size;
   2042				else
   2043					hbus->low_mmio_space += bar_size;
   2044			}
   2045		}
   2046	}
   2047
   2048	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2049	complete(event);
   2050}
   2051
   2052/**
   2053 * prepopulate_bars() - Fill in BARs with defaults
   2054 * @hbus:	Root PCI bus, as understood by this driver
   2055 *
   2056 * The core PCI driver code seems much, much happier if the BARs
   2057 * for a device have values upon first scan. So fill them in.
   2058 * The algorithm below works down from large sizes to small,
   2059 * attempting to pack the assignments optimally. The assumption,
   2060 * enforced in other parts of the code, is that the beginning of
   2061 * the memory-mapped I/O space will be aligned on the largest
   2062 * BAR size.
   2063 */
   2064static void prepopulate_bars(struct hv_pcibus_device *hbus)
   2065{
   2066	resource_size_t high_size = 0;
   2067	resource_size_t low_size = 0;
   2068	resource_size_t high_base = 0;
   2069	resource_size_t low_base = 0;
   2070	resource_size_t bar_size;
   2071	struct hv_pci_dev *hpdev;
   2072	unsigned long flags;
   2073	u64 bar_val;
   2074	u32 command;
   2075	bool high;
   2076	int i;
   2077
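	/*
	 * 1ULL << (63 - __builtin_clzll(x)) is the largest power of two not
	 * exceeding x; the loop below starts with that size and works down,
	 * so the biggest BARs are placed first.
	 */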
   2078	if (hbus->low_mmio_space) {
   2079		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
   2080		low_base = hbus->low_mmio_res->start;
   2081	}
   2082
   2083	if (hbus->high_mmio_space) {
   2084		high_size = 1ULL <<
   2085			(63 - __builtin_clzll(hbus->high_mmio_space));
   2086		high_base = hbus->high_mmio_res->start;
   2087	}
   2088
   2089	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2090
   2091	/*
   2092	 * Clear the memory enable bit, in case it's already set. This occurs
   2093	 * in the suspend path of hibernation, where the device is suspended,
   2094	 * resumed and suspended again: see hibernation_snapshot() and
   2095	 * hibernation_platform_enter().
   2096	 *
   2097	 * If the memory enable bit is already set, Hyper-V silently ignores
   2098	 * the below BAR updates, and the related PCI device driver can not
   2099	 * work, because reading from the device register(s) always returns
   2100	 * 0xFFFFFFFF (PCI_ERROR_RESPONSE).
   2101	 */
   2102	list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2103		_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
   2104		command &= ~PCI_COMMAND_MEMORY;
   2105		_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
   2106	}
   2107
   2108	/* Pick addresses for the BARs. */
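	/*
	 * Each pass of the outer loop assigns only the BARs whose size equals
	 * the current pass size, then the pass size is halved.  Since every
	 * BAR is a power of two in size, walking from largest to smallest
	 * packs the window without leaving alignment gaps.
	 */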
   2109	do {
   2110		list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2111			for (i = 0; i < PCI_STD_NUM_BARS; i++) {
   2112				bar_val = hpdev->probed_bar[i];
   2113				if (bar_val == 0)
   2114					continue;
   2115				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
   2116				if (high) {
   2117					bar_val |=
   2118						((u64)hpdev->probed_bar[i + 1]
   2119						 << 32);
   2120				} else {
   2121					bar_val |= 0xffffffffULL << 32;
   2122				}
   2123				bar_size = get_bar_size(bar_val);
   2124				if (high) {
   2125					if (high_size != bar_size) {
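						/*
						 * Not this pass; step past the
						 * upper half of this 64-bit
						 * BAR too.
						 */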
   2126						i++;
   2127						continue;
   2128					}
   2129					_hv_pcifront_write_config(hpdev,
   2130						PCI_BASE_ADDRESS_0 + (4 * i),
   2131						4,
   2132						(u32)(high_base & 0xffffff00));
   2133					i++;
   2134					_hv_pcifront_write_config(hpdev,
   2135						PCI_BASE_ADDRESS_0 + (4 * i),
   2136						4, (u32)(high_base >> 32));
   2137					high_base += bar_size;
   2138				} else {
   2139					if (low_size != bar_size)
   2140						continue;
   2141					_hv_pcifront_write_config(hpdev,
   2142						PCI_BASE_ADDRESS_0 + (4 * i),
   2143						4,
   2144						(u32)(low_base & 0xffffff00));
   2145					low_base += bar_size;
   2146				}
   2147			}
   2148			if (high_size <= 1 && low_size <= 1) {
   2149				/*
   2150				 * No need to set the PCI_COMMAND_MEMORY bit as
   2151				 * the core PCI driver doesn't require the bit
   2152				 * to be pre-set. Actually here we intentionally
   2153				 * keep the bit off so that the PCI BAR probing
   2154				 * in the core PCI driver doesn't cause Hyper-V
   2155				 * to unnecessarily unmap/map the virtual BARs
   2156				 * from/to the physical BARs multiple times.
   2157				 * This reduces the VM boot time significantly
   2158				 * if the BAR sizes are huge.
   2159				 */
   2160				break;
   2161			}
   2162		}
   2163
   2164		high_size >>= 1;
   2165		low_size >>= 1;
   2166	}  while (high_size || low_size);
   2167
   2168	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2169}
   2170
   2171/*
   2172 * Assign entries in sysfs pci slot directory.
   2173 *
    2174 * Note that this function does not need to lock the children list
    2175 * because it is called from pci_devices_present_work(), which is
    2176 * serialized with hv_eject_device_work() because both run on the
    2177 * same ordered workqueue.  Therefore the hbus->children list will
    2178 * not change even when pci_create_slot() sleeps.
   2179 */
   2180static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
   2181{
   2182	struct hv_pci_dev *hpdev;
   2183	char name[SLOT_NAME_SIZE];
   2184	int slot_nr;
   2185
   2186	list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2187		if (hpdev->pci_slot)
   2188			continue;
   2189
   2190		slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
   2191		snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
   2192		hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr,
   2193					  name, NULL);
   2194		if (IS_ERR(hpdev->pci_slot)) {
    2195			pr_warn("pci_create_slot %s failed\n", name);
   2196			hpdev->pci_slot = NULL;
   2197		}
   2198	}
   2199}
   2200
   2201/*
   2202 * Remove entries in sysfs pci slot directory.
   2203 */
   2204static void hv_pci_remove_slots(struct hv_pcibus_device *hbus)
   2205{
   2206	struct hv_pci_dev *hpdev;
   2207
   2208	list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2209		if (!hpdev->pci_slot)
   2210			continue;
   2211		pci_destroy_slot(hpdev->pci_slot);
   2212		hpdev->pci_slot = NULL;
   2213	}
   2214}
   2215
   2216/*
   2217 * Set NUMA node for the devices on the bus
   2218 */
   2219static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
   2220{
   2221	struct pci_dev *dev;
   2222	struct pci_bus *bus = hbus->bridge->bus;
   2223	struct hv_pci_dev *hv_dev;
   2224
   2225	list_for_each_entry(dev, &bus->devices, bus_list) {
   2226		hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
   2227		if (!hv_dev)
   2228			continue;
   2229
   2230		if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
   2231		    hv_dev->desc.virtual_numa_node < num_possible_nodes())
   2232			/*
   2233			 * The kernel may boot with some NUMA nodes offline
   2234			 * (e.g. in a KDUMP kernel) or with NUMA disabled via
   2235			 * "numa=off". In those cases, adjust the host provided
   2236			 * NUMA node to a valid NUMA node used by the kernel.
   2237			 */
   2238			set_dev_node(&dev->dev,
   2239				     numa_map_to_online_node(
   2240					     hv_dev->desc.virtual_numa_node));
   2241
   2242		put_pcichild(hv_dev);
   2243	}
   2244}
   2245
   2246/**
   2247 * create_root_hv_pci_bus() - Expose a new root PCI bus
   2248 * @hbus:	Root PCI bus, as understood by this driver
   2249 *
   2250 * Return: 0 on success, -errno on failure
   2251 */
   2252static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
   2253{
   2254	int error;
   2255	struct pci_host_bridge *bridge = hbus->bridge;
   2256
   2257	bridge->dev.parent = &hbus->hdev->device;
   2258	bridge->sysdata = &hbus->sysdata;
   2259	bridge->ops = &hv_pcifront_ops;
   2260
   2261	error = pci_scan_root_bus_bridge(bridge);
   2262	if (error)
   2263		return error;
   2264
   2265	pci_lock_rescan_remove();
   2266	hv_pci_assign_numa_node(hbus);
   2267	pci_bus_assign_resources(bridge->bus);
   2268	hv_pci_assign_slots(hbus);
   2269	pci_bus_add_devices(bridge->bus);
   2270	pci_unlock_rescan_remove();
   2271	hbus->state = hv_pcibus_installed;
   2272	return 0;
   2273}
   2274
   2275struct q_res_req_compl {
   2276	struct completion host_event;
   2277	struct hv_pci_dev *hpdev;
   2278};
   2279
   2280/**
   2281 * q_resource_requirements() - Query Resource Requirements
   2282 * @context:		The completion context.
   2283 * @resp:		The response that came from the host.
   2284 * @resp_packet_size:	The size in bytes of resp.
   2285 *
   2286 * This function is invoked on completion of a Query Resource
   2287 * Requirements packet.
   2288 */
   2289static void q_resource_requirements(void *context, struct pci_response *resp,
   2290				    int resp_packet_size)
   2291{
   2292	struct q_res_req_compl *completion = context;
   2293	struct pci_q_res_req_response *q_res_req =
   2294		(struct pci_q_res_req_response *)resp;
   2295	s32 status;
   2296	int i;
   2297
   2298	status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status;
   2299	if (status < 0) {
   2300		dev_err(&completion->hpdev->hbus->hdev->device,
   2301			"query resource requirements failed: %x\n",
   2302			status);
   2303	} else {
   2304		for (i = 0; i < PCI_STD_NUM_BARS; i++) {
   2305			completion->hpdev->probed_bar[i] =
   2306				q_res_req->probed_bar[i];
   2307		}
   2308	}
   2309
   2310	complete(&completion->host_event);
   2311}
   2312
   2313/**
   2314 * new_pcichild_device() - Create a new child device
   2315 * @hbus:	The internal struct tracking this root PCI bus.
   2316 * @desc:	The information supplied so far from the host
   2317 *              about the device.
   2318 *
   2319 * This function creates the tracking structure for a new child
   2320 * device and kicks off the process of figuring out what it is.
   2321 *
   2322 * Return: Pointer to the new tracking struct
   2323 */
   2324static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
   2325		struct hv_pcidev_description *desc)
   2326{
   2327	struct hv_pci_dev *hpdev;
   2328	struct pci_child_message *res_req;
   2329	struct q_res_req_compl comp_pkt;
   2330	struct {
   2331		struct pci_packet init_packet;
   2332		u8 buffer[sizeof(struct pci_child_message)];
   2333	} pkt;
   2334	unsigned long flags;
   2335	int ret;
   2336
   2337	hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL);
   2338	if (!hpdev)
   2339		return NULL;
   2340
   2341	hpdev->hbus = hbus;
   2342
   2343	memset(&pkt, 0, sizeof(pkt));
   2344	init_completion(&comp_pkt.host_event);
   2345	comp_pkt.hpdev = hpdev;
   2346	pkt.init_packet.compl_ctxt = &comp_pkt;
   2347	pkt.init_packet.completion_func = q_resource_requirements;
   2348	res_req = (struct pci_child_message *)&pkt.init_packet.message;
   2349	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
   2350	res_req->wslot.slot = desc->win_slot.slot;
   2351
   2352	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
   2353			       sizeof(struct pci_child_message),
   2354			       (unsigned long)&pkt.init_packet,
   2355			       VM_PKT_DATA_INBAND,
   2356			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   2357	if (ret)
   2358		goto error;
   2359
   2360	if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
   2361		goto error;
   2362
   2363	hpdev->desc = *desc;
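	/*
	 * The new device starts with two references: one for its presence on
	 * hbus->children and one long-lived reference taken here; both are
	 * dropped when the device is ejected or reported missing (see
	 * hv_eject_device_work() and pci_devices_present_work()).
	 */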
   2364	refcount_set(&hpdev->refs, 1);
   2365	get_pcichild(hpdev);
   2366	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2367
   2368	list_add_tail(&hpdev->list_entry, &hbus->children);
   2369	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2370	return hpdev;
   2371
   2372error:
   2373	kfree(hpdev);
   2374	return NULL;
   2375}
   2376
   2377/**
   2378 * get_pcichild_wslot() - Find device from slot
   2379 * @hbus:	Root PCI bus, as understood by this driver
   2380 * @wslot:	Location on the bus
   2381 *
   2382 * This function looks up a PCI device and returns the internal
   2383 * representation of it.  It acquires a reference on it, so that
   2384 * the device won't be deleted while somebody is using it.  The
   2385 * caller is responsible for calling put_pcichild() to release
   2386 * this reference.
   2387 *
   2388 * Return:	Internal representation of a PCI device
   2389 */
   2390static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
   2391					     u32 wslot)
   2392{
   2393	unsigned long flags;
   2394	struct hv_pci_dev *iter, *hpdev = NULL;
   2395
   2396	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2397	list_for_each_entry(iter, &hbus->children, list_entry) {
   2398		if (iter->desc.win_slot.slot == wslot) {
   2399			hpdev = iter;
   2400			get_pcichild(hpdev);
   2401			break;
   2402		}
   2403	}
   2404	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2405
   2406	return hpdev;
   2407}
   2408
   2409/**
   2410 * pci_devices_present_work() - Handle new list of child devices
   2411 * @work:	Work struct embedded in struct hv_dr_work
   2412 *
   2413 * "Bus Relations" is the Windows term for "children of this
   2414 * bus."  The terminology is preserved here for people trying to
   2415 * debug the interaction between Hyper-V and Linux.  This
   2416 * function is called when the parent partition reports a list
   2417 * of functions that should be observed under this PCI Express
   2418 * port (bus).
   2419 *
   2420 * This function updates the list, and must tolerate being
   2421 * called multiple times with the same information.  The typical
   2422 * number of child devices is one, with very atypical cases
   2423 * involving three or four, so the algorithms used here can be
   2424 * simple and inefficient.
   2425 *
   2426 * It must also treat the omission of a previously observed device as
   2427 * notification that the device no longer exists.
   2428 *
   2429 * Note that this function is serialized with hv_eject_device_work(),
   2430 * because both are pushed to the ordered workqueue hbus->wq.
   2431 */
   2432static void pci_devices_present_work(struct work_struct *work)
   2433{
   2434	u32 child_no;
   2435	bool found;
   2436	struct hv_pcidev_description *new_desc;
   2437	struct hv_pci_dev *hpdev;
   2438	struct hv_pcibus_device *hbus;
   2439	struct list_head removed;
   2440	struct hv_dr_work *dr_wrk;
   2441	struct hv_dr_state *dr = NULL;
   2442	unsigned long flags;
   2443
   2444	dr_wrk = container_of(work, struct hv_dr_work, wrk);
   2445	hbus = dr_wrk->bus;
   2446	kfree(dr_wrk);
   2447
   2448	INIT_LIST_HEAD(&removed);
   2449
   2450	/* Pull this off the queue and process it if it was the last one. */
   2451	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2452	while (!list_empty(&hbus->dr_list)) {
   2453		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
   2454				      list_entry);
   2455		list_del(&dr->list_entry);
   2456
   2457		/* Throw this away if the list still has stuff in it. */
   2458		if (!list_empty(&hbus->dr_list)) {
   2459			kfree(dr);
   2460			continue;
   2461		}
   2462	}
   2463	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2464
   2465	if (!dr)
   2466		return;
   2467
   2468	/* First, mark all existing children as reported missing. */
   2469	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2470	list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2471		hpdev->reported_missing = true;
   2472	}
   2473	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2474
   2475	/* Next, add back any reported devices. */
   2476	for (child_no = 0; child_no < dr->device_count; child_no++) {
   2477		found = false;
   2478		new_desc = &dr->func[child_no];
   2479
   2480		spin_lock_irqsave(&hbus->device_list_lock, flags);
   2481		list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2482			if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
   2483			    (hpdev->desc.v_id == new_desc->v_id) &&
   2484			    (hpdev->desc.d_id == new_desc->d_id) &&
   2485			    (hpdev->desc.ser == new_desc->ser)) {
   2486				hpdev->reported_missing = false;
   2487				found = true;
   2488			}
   2489		}
   2490		spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2491
   2492		if (!found) {
   2493			hpdev = new_pcichild_device(hbus, new_desc);
   2494			if (!hpdev)
   2495				dev_err(&hbus->hdev->device,
   2496					"couldn't record a child device.\n");
   2497		}
   2498	}
   2499
   2500	/* Move missing children to a list on the stack. */
   2501	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2502	do {
   2503		found = false;
   2504		list_for_each_entry(hpdev, &hbus->children, list_entry) {
   2505			if (hpdev->reported_missing) {
   2506				found = true;
   2507				put_pcichild(hpdev);
   2508				list_move_tail(&hpdev->list_entry, &removed);
   2509				break;
   2510			}
   2511		}
   2512	} while (found);
   2513	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2514
   2515	/* Delete everything that should no longer exist. */
   2516	while (!list_empty(&removed)) {
   2517		hpdev = list_first_entry(&removed, struct hv_pci_dev,
   2518					 list_entry);
   2519		list_del(&hpdev->list_entry);
   2520
   2521		if (hpdev->pci_slot)
   2522			pci_destroy_slot(hpdev->pci_slot);
   2523
   2524		put_pcichild(hpdev);
   2525	}
   2526
   2527	switch (hbus->state) {
   2528	case hv_pcibus_installed:
   2529		/*
    2530		 * Tell the core to rescan the bus
   2531		 * because there may have been changes.
   2532		 */
   2533		pci_lock_rescan_remove();
   2534		pci_scan_child_bus(hbus->bridge->bus);
   2535		hv_pci_assign_numa_node(hbus);
   2536		hv_pci_assign_slots(hbus);
   2537		pci_unlock_rescan_remove();
   2538		break;
   2539
   2540	case hv_pcibus_init:
   2541	case hv_pcibus_probed:
   2542		survey_child_resources(hbus);
   2543		break;
   2544
   2545	default:
   2546		break;
   2547	}
   2548
   2549	kfree(dr);
   2550}
   2551
   2552/**
   2553 * hv_pci_start_relations_work() - Queue work to start device discovery
   2554 * @hbus:	Root PCI bus, as understood by this driver
   2555 * @dr:		The list of children returned from host
   2556 *
   2557 * Return:  0 on success, -errno on failure
   2558 */
   2559static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
   2560				       struct hv_dr_state *dr)
   2561{
   2562	struct hv_dr_work *dr_wrk;
   2563	unsigned long flags;
   2564	bool pending_dr;
   2565
   2566	if (hbus->state == hv_pcibus_removing) {
   2567		dev_info(&hbus->hdev->device,
   2568			 "PCI VMBus BUS_RELATIONS: ignored\n");
   2569		return -ENOENT;
   2570	}
   2571
   2572	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
   2573	if (!dr_wrk)
   2574		return -ENOMEM;
   2575
   2576	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
   2577	dr_wrk->bus = hbus;
   2578
   2579	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2580	/*
   2581	 * If pending_dr is true, we have already queued a work,
   2582	 * which will see the new dr. Otherwise, we need to
   2583	 * queue a new work.
   2584	 */
   2585	pending_dr = !list_empty(&hbus->dr_list);
   2586	list_add_tail(&dr->list_entry, &hbus->dr_list);
   2587	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2588
   2589	if (pending_dr)
   2590		kfree(dr_wrk);
   2591	else
   2592		queue_work(hbus->wq, &dr_wrk->wrk);
   2593
   2594	return 0;
   2595}
   2596
   2597/**
   2598 * hv_pci_devices_present() - Handle list of new children
   2599 * @hbus:      Root PCI bus, as understood by this driver
   2600 * @relations: Packet from host listing children
   2601 *
    2602 * Process a new list of devices on the bus.  The list is discovered
    2603 * by the VSP and sent to us via the VSP message PCI_BUS_RELATIONS
    2604 * whenever the set of devices on this bus changes.
   2605 */
   2606static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
   2607				   struct pci_bus_relations *relations)
   2608{
   2609	struct hv_dr_state *dr;
   2610	int i;
   2611
   2612	dr = kzalloc(struct_size(dr, func, relations->device_count),
   2613		     GFP_NOWAIT);
   2614	if (!dr)
   2615		return;
   2616
   2617	dr->device_count = relations->device_count;
   2618	for (i = 0; i < dr->device_count; i++) {
   2619		dr->func[i].v_id = relations->func[i].v_id;
   2620		dr->func[i].d_id = relations->func[i].d_id;
   2621		dr->func[i].rev = relations->func[i].rev;
   2622		dr->func[i].prog_intf = relations->func[i].prog_intf;
   2623		dr->func[i].subclass = relations->func[i].subclass;
   2624		dr->func[i].base_class = relations->func[i].base_class;
   2625		dr->func[i].subsystem_id = relations->func[i].subsystem_id;
   2626		dr->func[i].win_slot = relations->func[i].win_slot;
   2627		dr->func[i].ser = relations->func[i].ser;
   2628	}
   2629
   2630	if (hv_pci_start_relations_work(hbus, dr))
   2631		kfree(dr);
   2632}
   2633
   2634/**
   2635 * hv_pci_devices_present2() - Handle list of new children
   2636 * @hbus:	Root PCI bus, as understood by this driver
   2637 * @relations:	Packet from host listing children
   2638 *
   2639 * This function is the v2 version of hv_pci_devices_present()
   2640 */
   2641static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
   2642				    struct pci_bus_relations2 *relations)
   2643{
   2644	struct hv_dr_state *dr;
   2645	int i;
   2646
   2647	dr = kzalloc(struct_size(dr, func, relations->device_count),
   2648		     GFP_NOWAIT);
   2649	if (!dr)
   2650		return;
   2651
   2652	dr->device_count = relations->device_count;
   2653	for (i = 0; i < dr->device_count; i++) {
   2654		dr->func[i].v_id = relations->func[i].v_id;
   2655		dr->func[i].d_id = relations->func[i].d_id;
   2656		dr->func[i].rev = relations->func[i].rev;
   2657		dr->func[i].prog_intf = relations->func[i].prog_intf;
   2658		dr->func[i].subclass = relations->func[i].subclass;
   2659		dr->func[i].base_class = relations->func[i].base_class;
   2660		dr->func[i].subsystem_id = relations->func[i].subsystem_id;
   2661		dr->func[i].win_slot = relations->func[i].win_slot;
   2662		dr->func[i].ser = relations->func[i].ser;
   2663		dr->func[i].flags = relations->func[i].flags;
   2664		dr->func[i].virtual_numa_node =
   2665			relations->func[i].virtual_numa_node;
   2666	}
   2667
   2668	if (hv_pci_start_relations_work(hbus, dr))
   2669		kfree(dr);
   2670}
   2671
   2672/**
   2673 * hv_eject_device_work() - Asynchronously handles ejection
   2674 * @work:	Work struct embedded in internal device struct
   2675 *
   2676 * This function handles ejecting a device.  Windows will
   2677 * attempt to gracefully eject a device, waiting 60 seconds to
   2678 * hear back from the guest OS that this completed successfully.
   2679 * If this timer expires, the device will be forcibly removed.
   2680 */
   2681static void hv_eject_device_work(struct work_struct *work)
   2682{
   2683	struct pci_eject_response *ejct_pkt;
   2684	struct hv_pcibus_device *hbus;
   2685	struct hv_pci_dev *hpdev;
   2686	struct pci_dev *pdev;
   2687	unsigned long flags;
   2688	int wslot;
   2689	struct {
   2690		struct pci_packet pkt;
   2691		u8 buffer[sizeof(struct pci_eject_response)];
   2692	} ctxt;
   2693
   2694	hpdev = container_of(work, struct hv_pci_dev, wrk);
   2695	hbus = hpdev->hbus;
   2696
   2697	WARN_ON(hpdev->state != hv_pcichild_ejecting);
   2698
   2699	/*
   2700	 * Ejection can come before or after the PCI bus has been set up, so
   2701	 * attempt to find it and tear down the bus state, if it exists.  This
   2702	 * must be done without constructs like pci_domain_nr(hbus->bridge->bus)
   2703	 * because hbus->bridge->bus may not exist yet.
   2704	 */
   2705	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
   2706	pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot);
   2707	if (pdev) {
   2708		pci_lock_rescan_remove();
   2709		pci_stop_and_remove_bus_device(pdev);
   2710		pci_dev_put(pdev);
   2711		pci_unlock_rescan_remove();
   2712	}
   2713
   2714	spin_lock_irqsave(&hbus->device_list_lock, flags);
   2715	list_del(&hpdev->list_entry);
   2716	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   2717
   2718	if (hpdev->pci_slot)
   2719		pci_destroy_slot(hpdev->pci_slot);
   2720
   2721	memset(&ctxt, 0, sizeof(ctxt));
   2722	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
   2723	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
   2724	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
   2725	vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
   2726			 sizeof(*ejct_pkt), 0,
   2727			 VM_PKT_DATA_INBAND, 0);
   2728
   2729	/* For the get_pcichild() in hv_pci_eject_device() */
   2730	put_pcichild(hpdev);
   2731	/* For the two refs got in new_pcichild_device() */
   2732	put_pcichild(hpdev);
   2733	put_pcichild(hpdev);
   2734	/* hpdev has been freed. Do not use it any more. */
   2735}
   2736
   2737/**
   2738 * hv_pci_eject_device() - Handles device ejection
   2739 * @hpdev:	Internal device tracking struct
   2740 *
   2741 * This function is invoked when an ejection packet arrives.  It
   2742 * just schedules work so that we don't re-enter the packet
   2743 * delivery code handling the ejection.
   2744 */
   2745static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
   2746{
   2747	struct hv_pcibus_device *hbus = hpdev->hbus;
   2748	struct hv_device *hdev = hbus->hdev;
   2749
   2750	if (hbus->state == hv_pcibus_removing) {
   2751		dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
   2752		return;
   2753	}
   2754
   2755	hpdev->state = hv_pcichild_ejecting;
   2756	get_pcichild(hpdev);
   2757	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
   2758	queue_work(hbus->wq, &hpdev->wrk);
   2759}
   2760
   2761/**
   2762 * hv_pci_onchannelcallback() - Handles incoming packets
   2763 * @context:	Internal bus tracking struct
   2764 *
   2765 * This function is invoked whenever the host sends a packet to
   2766 * this channel (which is private to this root PCI bus).
   2767 */
   2768static void hv_pci_onchannelcallback(void *context)
   2769{
   2770	const int packet_size = 0x100;
   2771	int ret;
   2772	struct hv_pcibus_device *hbus = context;
   2773	struct vmbus_channel *chan = hbus->hdev->channel;
   2774	u32 bytes_recvd;
   2775	u64 req_id, req_addr;
   2776	struct vmpacket_descriptor *desc;
   2777	unsigned char *buffer;
   2778	int bufferlen = packet_size;
   2779	struct pci_packet *comp_packet;
   2780	struct pci_response *response;
   2781	struct pci_incoming_message *new_message;
   2782	struct pci_bus_relations *bus_rel;
   2783	struct pci_bus_relations2 *bus_rel2;
   2784	struct pci_dev_inval_block *inval;
   2785	struct pci_dev_incoming *dev_message;
   2786	struct hv_pci_dev *hpdev;
   2787	unsigned long flags;
   2788
   2789	buffer = kmalloc(bufferlen, GFP_ATOMIC);
   2790	if (!buffer)
   2791		return;
   2792
   2793	while (1) {
   2794		ret = vmbus_recvpacket_raw(chan, buffer, bufferlen,
   2795					   &bytes_recvd, &req_id);
   2796
   2797		if (ret == -ENOBUFS) {
   2798			kfree(buffer);
   2799			/* Handle large packet */
   2800			bufferlen = bytes_recvd;
   2801			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
   2802			if (!buffer)
   2803				return;
   2804			continue;
   2805		}
   2806
   2807		/* Zero length indicates there are no more packets. */
   2808		if (ret || !bytes_recvd)
   2809			break;
   2810
   2811		/*
   2812		 * All incoming packets must be at least as large as a
   2813		 * response.
   2814		 */
   2815		if (bytes_recvd <= sizeof(struct pci_response))
   2816			continue;
   2817		desc = (struct vmpacket_descriptor *)buffer;
   2818
   2819		switch (desc->type) {
   2820		case VM_PKT_COMP:
   2821
   2822			lock_requestor(chan, flags);
   2823			req_addr = __vmbus_request_addr_match(chan, req_id,
   2824							      VMBUS_RQST_ADDR_ANY);
   2825			if (req_addr == VMBUS_RQST_ERROR) {
   2826				unlock_requestor(chan, flags);
   2827				dev_err(&hbus->hdev->device,
   2828					"Invalid transaction ID %llx\n",
   2829					req_id);
   2830				break;
   2831			}
   2832			comp_packet = (struct pci_packet *)req_addr;
   2833			response = (struct pci_response *)buffer;
   2834			/*
   2835			 * Call ->completion_func() within the critical section to make
   2836			 * sure that the packet pointer is still valid during the call:
   2837			 * here 'valid' means that there's a task still waiting for the
   2838			 * completion, and that the packet data is still on the waiting
   2839			 * task's stack.  Cf. hv_compose_msi_msg().
   2840			 */
   2841			comp_packet->completion_func(comp_packet->compl_ctxt,
   2842						     response,
   2843						     bytes_recvd);
   2844			unlock_requestor(chan, flags);
   2845			break;
   2846
   2847		case VM_PKT_DATA_INBAND:
   2848
   2849			new_message = (struct pci_incoming_message *)buffer;
   2850			switch (new_message->message_type.type) {
   2851			case PCI_BUS_RELATIONS:
   2852
   2853				bus_rel = (struct pci_bus_relations *)buffer;
   2854				if (bytes_recvd < sizeof(*bus_rel) ||
   2855				    bytes_recvd <
   2856					struct_size(bus_rel, func,
   2857						    bus_rel->device_count)) {
   2858					dev_err(&hbus->hdev->device,
   2859						"bus relations too small\n");
   2860					break;
   2861				}
   2862
   2863				hv_pci_devices_present(hbus, bus_rel);
   2864				break;
   2865
   2866			case PCI_BUS_RELATIONS2:
   2867
   2868				bus_rel2 = (struct pci_bus_relations2 *)buffer;
   2869				if (bytes_recvd < sizeof(*bus_rel2) ||
   2870				    bytes_recvd <
   2871					struct_size(bus_rel2, func,
   2872						    bus_rel2->device_count)) {
   2873					dev_err(&hbus->hdev->device,
   2874						"bus relations v2 too small\n");
   2875					break;
   2876				}
   2877
   2878				hv_pci_devices_present2(hbus, bus_rel2);
   2879				break;
   2880
   2881			case PCI_EJECT:
   2882
   2883				dev_message = (struct pci_dev_incoming *)buffer;
   2884				if (bytes_recvd < sizeof(*dev_message)) {
   2885					dev_err(&hbus->hdev->device,
   2886						"eject message too small\n");
   2887					break;
   2888				}
   2889				hpdev = get_pcichild_wslot(hbus,
   2890						      dev_message->wslot.slot);
   2891				if (hpdev) {
   2892					hv_pci_eject_device(hpdev);
   2893					put_pcichild(hpdev);
   2894				}
   2895				break;
   2896
   2897			case PCI_INVALIDATE_BLOCK:
   2898
   2899				inval = (struct pci_dev_inval_block *)buffer;
   2900				if (bytes_recvd < sizeof(*inval)) {
   2901					dev_err(&hbus->hdev->device,
   2902						"invalidate message too small\n");
   2903					break;
   2904				}
   2905				hpdev = get_pcichild_wslot(hbus,
   2906							   inval->wslot.slot);
   2907				if (hpdev) {
   2908					if (hpdev->block_invalidate) {
   2909						hpdev->block_invalidate(
   2910						    hpdev->invalidate_context,
   2911						    inval->block_mask);
   2912					}
   2913					put_pcichild(hpdev);
   2914				}
   2915				break;
   2916
   2917			default:
   2918				dev_warn(&hbus->hdev->device,
   2919					"Unimplemented protocol message %x\n",
   2920					new_message->message_type.type);
   2921				break;
   2922			}
   2923			break;
   2924
   2925		default:
   2926			dev_err(&hbus->hdev->device,
   2927				"unhandled packet type %d, tid %llx len %d\n",
   2928				desc->type, req_id, bytes_recvd);
   2929			break;
   2930		}
   2931	}
   2932
   2933	kfree(buffer);
   2934}
   2935
   2936/**
   2937 * hv_pci_protocol_negotiation() - Set up protocol
   2938 * @hdev:		VMBus's tracking struct for this root PCI bus.
   2939 * @version:		Array of supported channel protocol versions in
    2940 *			the order of probing - highest goes first.
   2941 * @num_version:	Number of elements in the version array.
   2942 *
   2943 * This driver is intended to support running on Windows 10
   2944 * (server) and later versions. It will not run on earlier
   2945 * versions, as they assume that many of the operations which
   2946 * Linux needs accomplished with a spinlock held were done via
   2947 * asynchronous messaging via VMBus.  Windows 10 increases the
   2948 * surface area of PCI emulation so that these actions can take
   2949 * place by suspending a virtual processor for their duration.
   2950 *
   2951 * This function negotiates the channel protocol version,
   2952 * failing if the host doesn't support the necessary protocol
   2953 * level.
   2954 */
   2955static int hv_pci_protocol_negotiation(struct hv_device *hdev,
   2956				       enum pci_protocol_version_t version[],
   2957				       int num_version)
   2958{
   2959	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   2960	struct pci_version_request *version_req;
   2961	struct hv_pci_compl comp_pkt;
   2962	struct pci_packet *pkt;
   2963	int ret;
   2964	int i;
   2965
   2966	/*
   2967	 * Initiate the handshake with the host and negotiate
   2968	 * a version that the host can support. We start with the
   2969	 * highest version number and go down if the host cannot
   2970	 * support it.
   2971	 */
   2972	pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
   2973	if (!pkt)
   2974		return -ENOMEM;
   2975
   2976	init_completion(&comp_pkt.host_event);
   2977	pkt->completion_func = hv_pci_generic_compl;
   2978	pkt->compl_ctxt = &comp_pkt;
   2979	version_req = (struct pci_version_request *)&pkt->message;
   2980	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
   2981
   2982	for (i = 0; i < num_version; i++) {
   2983		version_req->protocol_version = version[i];
   2984		ret = vmbus_sendpacket(hdev->channel, version_req,
   2985				sizeof(struct pci_version_request),
   2986				(unsigned long)pkt, VM_PKT_DATA_INBAND,
   2987				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   2988		if (!ret)
   2989			ret = wait_for_response(hdev, &comp_pkt.host_event);
   2990
   2991		if (ret) {
   2992			dev_err(&hdev->device,
    2993				"PCI Pass-through VSP failed to request version: %d\n",
   2994				ret);
   2995			goto exit;
   2996		}
   2997
   2998		if (comp_pkt.completion_status >= 0) {
   2999			hbus->protocol_version = version[i];
   3000			dev_info(&hdev->device,
   3001				"PCI VMBus probing: Using version %#x\n",
   3002				hbus->protocol_version);
   3003			goto exit;
   3004		}
   3005
   3006		if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
   3007			dev_err(&hdev->device,
    3008				"PCI Pass-through VSP failed version request: %#x\n",
   3009				comp_pkt.completion_status);
   3010			ret = -EPROTO;
   3011			goto exit;
   3012		}
   3013
   3014		reinit_completion(&comp_pkt.host_event);
   3015	}
   3016
   3017	dev_err(&hdev->device,
    3018		"PCI Pass-through VSP failed to find supported version\n");
   3019	ret = -EPROTO;
   3020
   3021exit:
   3022	kfree(pkt);
   3023	return ret;
   3024}
   3025
   3026/**
   3027 * hv_pci_free_bridge_windows() - Release memory regions for the
   3028 * bus
   3029 * @hbus:	Root PCI bus, as understood by this driver
   3030 */
   3031static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
   3032{
   3033	/*
   3034	 * Set the resources back to the way they looked when they
   3035	 * were allocated by setting IORESOURCE_BUSY again.
   3036	 */
   3037
   3038	if (hbus->low_mmio_space && hbus->low_mmio_res) {
   3039		hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
   3040		vmbus_free_mmio(hbus->low_mmio_res->start,
   3041				resource_size(hbus->low_mmio_res));
   3042	}
   3043
   3044	if (hbus->high_mmio_space && hbus->high_mmio_res) {
   3045		hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
   3046		vmbus_free_mmio(hbus->high_mmio_res->start,
   3047				resource_size(hbus->high_mmio_res));
   3048	}
   3049}
   3050
   3051/**
   3052 * hv_pci_allocate_bridge_windows() - Allocate memory regions
   3053 * for the bus
   3054 * @hbus:	Root PCI bus, as understood by this driver
   3055 *
   3056 * This function calls vmbus_allocate_mmio(), which is itself a
   3057 * bit of a compromise.  Ideally, we might change the pnp layer
   3058 * in the kernel such that it comprehends either PCI devices
   3059 * which are "grandchildren of ACPI," with some intermediate bus
   3060 * node (in this case, VMBus) or change it such that it
   3061 * understands VMBus.  The pnp layer, however, has been declared
   3062 * deprecated, and not subject to change.
   3063 *
   3064 * The workaround, implemented here, is to ask VMBus to allocate
   3065 * MMIO space for this bus.  VMBus itself knows which ranges are
   3066 * appropriate by looking at its own ACPI objects.  Then, after
   3067 * these ranges are claimed, they're modified to look like they
   3068 * would have looked if the ACPI and pnp code had allocated
   3069 * bridge windows.  These descriptors have to exist in this form
   3070 * in order to satisfy the code which will get invoked when the
   3071 * endpoint PCI function driver calls request_mem_region() or
   3072 * request_mem_region_exclusive().
   3073 *
   3074 * Return: 0 on success, -errno on failure
   3075 */
   3076static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
   3077{
   3078	resource_size_t align;
   3079	int ret;
   3080
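	/*
	 * 32-bit BARs have to land below 4 GiB, so the low window is
	 * allocated from [0, 0xffffffff]; the high window for 64-bit BARs is
	 * allocated at or above 0x100000000.
	 */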
   3081	if (hbus->low_mmio_space) {
   3082		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
   3083		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
   3084					  (u64)(u32)0xffffffff,
   3085					  hbus->low_mmio_space,
   3086					  align, false);
   3087		if (ret) {
   3088			dev_err(&hbus->hdev->device,
   3089				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
   3090				hbus->low_mmio_space);
   3091			return ret;
   3092		}
   3093
   3094		/* Modify this resource to become a bridge window. */
   3095		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
   3096		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
   3097		pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res);
   3098	}
   3099
   3100	if (hbus->high_mmio_space) {
   3101		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
   3102		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
   3103					  0x100000000, -1,
   3104					  hbus->high_mmio_space, align,
   3105					  false);
   3106		if (ret) {
   3107			dev_err(&hbus->hdev->device,
   3108				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
   3109				hbus->high_mmio_space);
   3110			goto release_low_mmio;
   3111		}
   3112
   3113		/* Modify this resource to become a bridge window. */
   3114		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
   3115		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
   3116		pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res);
   3117	}
   3118
   3119	return 0;
   3120
   3121release_low_mmio:
   3122	if (hbus->low_mmio_res) {
   3123		vmbus_free_mmio(hbus->low_mmio_res->start,
   3124				resource_size(hbus->low_mmio_res));
   3125	}
   3126
   3127	return ret;
   3128}
   3129
   3130/**
   3131 * hv_allocate_config_window() - Find MMIO space for PCI Config
   3132 * @hbus:	Root PCI bus, as understood by this driver
   3133 *
   3134 * This function claims memory-mapped I/O space for accessing
   3135 * configuration space for the functions on this bus.
   3136 *
   3137 * Return: 0 on success, -errno on failure
   3138 */
   3139static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
   3140{
   3141	int ret;
   3142
   3143	/*
   3144	 * Set up a region of MMIO space to use for accessing configuration
   3145	 * space.
   3146	 */
   3147	ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
   3148				  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
   3149	if (ret)
   3150		return ret;
   3151
   3152	/*
   3153	 * vmbus_allocate_mmio() gets used for allocating both device endpoint
   3154	 * resource claims (those which cannot be overlapped) and the ranges
   3155	 * which are valid for the children of this bus, which are intended
   3156	 * to be overlapped by those children.  Set the flag on this claim
   3157	 * meaning that this region can't be overlapped.
   3158	 */
   3159
   3160	hbus->mem_config->flags |= IORESOURCE_BUSY;
   3161
   3162	return 0;
   3163}
   3164
   3165static void hv_free_config_window(struct hv_pcibus_device *hbus)
   3166{
   3167	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
   3168}
   3169
   3170static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);
   3171
   3172/**
   3173 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
   3174 * @hdev:	VMBus's tracking struct for this root PCI bus
   3175 *
   3176 * Return: 0 on success, -errno on failure
   3177 */
   3178static int hv_pci_enter_d0(struct hv_device *hdev)
   3179{
   3180	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3181	struct pci_bus_d0_entry *d0_entry;
   3182	struct hv_pci_compl comp_pkt;
   3183	struct pci_packet *pkt;
   3184	int ret;
   3185
   3186	/*
   3187	 * Tell the host that the bus is ready to use, and moved into the
   3188	 * powered-on state.  This includes telling the host which region
   3189	 * of memory-mapped I/O space has been chosen for configuration space
   3190	 * access.
   3191	 */
   3192	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
   3193	if (!pkt)
   3194		return -ENOMEM;
   3195
   3196	init_completion(&comp_pkt.host_event);
   3197	pkt->completion_func = hv_pci_generic_compl;
   3198	pkt->compl_ctxt = &comp_pkt;
   3199	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
   3200	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
   3201	d0_entry->mmio_base = hbus->mem_config->start;
   3202
   3203	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
   3204			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
   3205			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   3206	if (!ret)
   3207		ret = wait_for_response(hdev, &comp_pkt.host_event);
   3208
   3209	if (ret)
   3210		goto exit;
   3211
   3212	if (comp_pkt.completion_status < 0) {
   3213		dev_err(&hdev->device,
   3214			"PCI Pass-through VSP failed D0 Entry with status %x\n",
   3215			comp_pkt.completion_status);
   3216		ret = -EPROTO;
   3217		goto exit;
   3218	}
   3219
   3220	ret = 0;
   3221
   3222exit:
   3223	kfree(pkt);
   3224	return ret;
   3225}
   3226
   3227/**
   3228 * hv_pci_query_relations() - Ask host to send list of child
   3229 * devices
   3230 * @hdev:	VMBus's tracking struct for this root PCI bus
   3231 *
   3232 * Return: 0 on success, -errno on failure
   3233 */
   3234static int hv_pci_query_relations(struct hv_device *hdev)
   3235{
   3236	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3237	struct pci_message message;
   3238	struct completion comp;
   3239	int ret;
   3240
   3241	/* Ask the host to send along the list of child devices */
   3242	init_completion(&comp);
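        	/*
        	 * cmpxchg() below atomically installs this completion as the one
        	 * outstanding survey; if another survey is already pending, give
        	 * up with -ENOTEMPTY instead of queueing a second one.
        	 */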
   3243	if (cmpxchg(&hbus->survey_event, NULL, &comp))
   3244		return -ENOTEMPTY;
   3245
   3246	memset(&message, 0, sizeof(message));
   3247	message.type = PCI_QUERY_BUS_RELATIONS;
   3248
   3249	ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
   3250			       0, VM_PKT_DATA_INBAND, 0);
   3251	if (!ret)
   3252		ret = wait_for_response(hdev, &comp);
   3253
   3254	return ret;
   3255}
   3256
   3257/**
   3258 * hv_send_resources_allocated() - Report local resource choices
   3259 * @hdev:	VMBus's tracking struct for this root PCI bus
   3260 *
   3261 * The host OS is expecting to be sent a request as a message
   3262 * which contains all the resources that the device will use.
    3263 * The response contains those same resources "translated",
    3264 * that is, the values that the hardware should use when it
    3265 * delivers an interrupt.  (MMIO resources are used in local
    3266 * terms.)  This is nice for Windows, and lines up with the
    3267 * FDO/PDO split, which doesn't exist in Linux.  Linux instead
    3268 * expects to scan an emulated PCI configuration space.  So this
    3269 * message is sent here only to drive the state machine on the
    3270 * host forward.
   3271 *
   3272 * Return: 0 on success, -errno on failure
   3273 */
   3274static int hv_send_resources_allocated(struct hv_device *hdev)
   3275{
   3276	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3277	struct pci_resources_assigned *res_assigned;
   3278	struct pci_resources_assigned2 *res_assigned2;
   3279	struct hv_pci_compl comp_pkt;
   3280	struct hv_pci_dev *hpdev;
   3281	struct pci_packet *pkt;
   3282	size_t size_res;
   3283	int wslot;
   3284	int ret;
   3285
   3286	size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
   3287			? sizeof(*res_assigned) : sizeof(*res_assigned2);
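        	/*
        	 * Protocol versions 1.2 and later use the larger
        	 * PCI_RESOURCES_ASSIGNED2 message; earlier versions use
        	 * PCI_RESOURCES_ASSIGNED, hence the two possible sizes above.
        	 */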
   3288
   3289	pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
   3290	if (!pkt)
   3291		return -ENOMEM;
   3292
   3293	ret = 0;
   3294
   3295	for (wslot = 0; wslot < 256; wslot++) {
   3296		hpdev = get_pcichild_wslot(hbus, wslot);
   3297		if (!hpdev)
   3298			continue;
   3299
   3300		memset(pkt, 0, sizeof(*pkt) + size_res);
   3301		init_completion(&comp_pkt.host_event);
   3302		pkt->completion_func = hv_pci_generic_compl;
   3303		pkt->compl_ctxt = &comp_pkt;
   3304
   3305		if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
   3306			res_assigned =
   3307				(struct pci_resources_assigned *)&pkt->message;
   3308			res_assigned->message_type.type =
   3309				PCI_RESOURCES_ASSIGNED;
   3310			res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
   3311		} else {
   3312			res_assigned2 =
   3313				(struct pci_resources_assigned2 *)&pkt->message;
   3314			res_assigned2->message_type.type =
   3315				PCI_RESOURCES_ASSIGNED2;
   3316			res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
   3317		}
   3318		put_pcichild(hpdev);
   3319
   3320		ret = vmbus_sendpacket(hdev->channel, &pkt->message,
   3321				size_res, (unsigned long)pkt,
   3322				VM_PKT_DATA_INBAND,
   3323				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   3324		if (!ret)
   3325			ret = wait_for_response(hdev, &comp_pkt.host_event);
   3326		if (ret)
   3327			break;
   3328
   3329		if (comp_pkt.completion_status < 0) {
   3330			ret = -EPROTO;
   3331			dev_err(&hdev->device,
   3332				"resource allocated returned 0x%x",
   3333				comp_pkt.completion_status);
   3334			break;
   3335		}
   3336
   3337		hbus->wslot_res_allocated = wslot;
   3338	}
   3339
   3340	kfree(pkt);
   3341	return ret;
   3342}
   3343
   3344/**
   3345 * hv_send_resources_released() - Report local resources
   3346 * released
   3347 * @hdev:	VMBus's tracking struct for this root PCI bus
   3348 *
   3349 * Return: 0 on success, -errno on failure
   3350 */
   3351static int hv_send_resources_released(struct hv_device *hdev)
   3352{
   3353	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3354	struct pci_child_message pkt;
   3355	struct hv_pci_dev *hpdev;
   3356	int wslot;
   3357	int ret;
   3358
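        	/*
        	 * Walk the slots in the reverse order of allocation, updating
        	 * wslot_res_allocated after each release so the bookkeeping stays
        	 * accurate even if a send fails partway through.
        	 */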
   3359	for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
   3360		hpdev = get_pcichild_wslot(hbus, wslot);
   3361		if (!hpdev)
   3362			continue;
   3363
   3364		memset(&pkt, 0, sizeof(pkt));
   3365		pkt.message_type.type = PCI_RESOURCES_RELEASED;
   3366		pkt.wslot.slot = hpdev->desc.win_slot.slot;
   3367
   3368		put_pcichild(hpdev);
   3369
   3370		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
   3371				       VM_PKT_DATA_INBAND, 0);
   3372		if (ret)
   3373			return ret;
   3374
   3375		hbus->wslot_res_allocated = wslot - 1;
   3376	}
   3377
   3378	hbus->wslot_res_allocated = -1;
   3379
   3380	return 0;
   3381}
   3382
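        /*
         * Bitmap of PCI domain numbers in use: one bit per possible 16-bit
         * domain number (0x0000 through 0xffff).
         */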
   3383#define HVPCI_DOM_MAP_SIZE (64 * 1024)
   3384static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
   3385
   3386/*
   3387 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
   3388 * as invalid for passthrough PCI devices of this driver.
   3389 */
   3390#define HVPCI_DOM_INVALID 0
   3391
   3392/**
   3393 * hv_get_dom_num() - Get a valid PCI domain number
    3394 * @dom: Requested domain number
    3395 *
    3396 * Mark the requested domain number as in use and return it.  If it is
    3397 * already in use, find and return another free number instead.
    3398 *
    3399 * Return: domain number on success, HVPCI_DOM_INVALID on failure
   3400 */
   3401static u16 hv_get_dom_num(u16 dom)
   3402{
   3403	unsigned int i;
   3404
   3405	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
   3406		return dom;
   3407
   3408	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
   3409		if (test_and_set_bit(i, hvpci_dom_map) == 0)
   3410			return i;
   3411	}
   3412
   3413	return HVPCI_DOM_INVALID;
   3414}
   3415
   3416/**
   3417 * hv_put_dom_num() - Mark the PCI domain number as free
   3418 * @dom: Domain number to be freed
   3419 */
   3420static void hv_put_dom_num(u16 dom)
   3421{
   3422	clear_bit(dom, hvpci_dom_map);
   3423}
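        /*
         * Illustrative flow: hv_pci_probe() below derives a requested domain
         * number from two bytes of the VMBus instance GUID, claims it (or a
         * free alternative on collision) via hv_get_dom_num(), and
         * hv_pci_remove() returns it with hv_put_dom_num().
         */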
   3424
   3425/**
   3426 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
   3427 * @hdev:	VMBus's tracking struct for this root PCI bus
   3428 * @dev_id:	Identifies the device itself
   3429 *
   3430 * Return: 0 on success, -errno on failure
   3431 */
   3432static int hv_pci_probe(struct hv_device *hdev,
   3433			const struct hv_vmbus_device_id *dev_id)
   3434{
   3435	struct pci_host_bridge *bridge;
   3436	struct hv_pcibus_device *hbus;
   3437	u16 dom_req, dom;
   3438	char *name;
   3439	bool enter_d0_retry = true;
   3440	int ret;
   3441
   3442	/*
   3443	 * hv_pcibus_device contains the hypercall arguments for retargeting in
   3444	 * hv_irq_unmask(). Those must not cross a page boundary.
   3445	 */
   3446	BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
   3447
   3448	bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
   3449	if (!bridge)
   3450		return -ENOMEM;
   3451
   3452	/*
   3453	 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
   3454	 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
   3455	 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
   3456	 * alignment of hbus is important because hbus's field
   3457	 * retarget_msi_interrupt_params must not cross a 4KB page boundary.
   3458	 *
   3459	 * Here we prefer kzalloc to get_zeroed_page(), because a buffer
   3460	 * allocated by the latter is not tracked and scanned by kmemleak, and
   3461	 * hence kmemleak reports the pointer contained in the hbus buffer
   3462	 * (i.e. the hpdev struct, which is created in new_pcichild_device() and
    3463	 * is tracked by hbus->children) as a memory leak (false positive).
   3464	 *
   3465	 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
   3466	 * used to allocate the hbus buffer and we can avoid the kmemleak false
   3467	 * positive by using kmemleak_alloc() and kmemleak_free() to ask
   3468	 * kmemleak to track and scan the hbus buffer.
   3469	 */
   3470	hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
   3471	if (!hbus)
   3472		return -ENOMEM;
   3473
   3474	hbus->bridge = bridge;
   3475	hbus->state = hv_pcibus_init;
   3476	hbus->wslot_res_allocated = -1;
   3477
   3478	/*
   3479	 * The PCI bus "domain" is what is called "segment" in ACPI and other
   3480	 * specs. Pull it from the instance ID, to get something usually
    3481	 * unique. In rare cases of collision, we will find another number
    3482	 * that is not in use.
   3483	 *
   3484	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
   3485	 * together with this guest driver can guarantee that (1) The only
   3486	 * domain used by Gen1 VMs for something that looks like a physical
   3487	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
   3488	 * (2) There will be no overlap between domains (after fixing possible
   3489	 * collisions) in the same VM.
   3490	 */
   3491	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
   3492	dom = hv_get_dom_num(dom_req);
   3493
   3494	if (dom == HVPCI_DOM_INVALID) {
   3495		dev_err(&hdev->device,
   3496			"Unable to use dom# 0x%x or other numbers", dom_req);
   3497		ret = -EINVAL;
   3498		goto free_bus;
   3499	}
   3500
   3501	if (dom != dom_req)
   3502		dev_info(&hdev->device,
   3503			 "PCI dom# 0x%x has collision, using 0x%x",
   3504			 dom_req, dom);
   3505
   3506	hbus->bridge->domain_nr = dom;
   3507#ifdef CONFIG_X86
   3508	hbus->sysdata.domain = dom;
   3509#elif defined(CONFIG_ARM64)
   3510	/*
   3511	 * Set the PCI bus parent to be the corresponding VMbus
   3512	 * device. Then the VMbus device will be assigned as the
   3513	 * ACPI companion in pcibios_root_bridge_prepare() and
   3514	 * pci_dma_configure() will propagate device coherence
   3515	 * information to devices created on the bus.
   3516	 */
   3517	hbus->sysdata.parent = hdev->device.parent;
   3518#endif
   3519
   3520	hbus->hdev = hdev;
   3521	INIT_LIST_HEAD(&hbus->children);
   3522	INIT_LIST_HEAD(&hbus->dr_list);
   3523	spin_lock_init(&hbus->config_lock);
   3524	spin_lock_init(&hbus->device_list_lock);
   3525	spin_lock_init(&hbus->retarget_msi_interrupt_lock);
   3526	hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
   3527					   hbus->bridge->domain_nr);
   3528	if (!hbus->wq) {
   3529		ret = -ENOMEM;
   3530		goto free_dom;
   3531	}
   3532
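        	/*
        	 * Use the standard VMBus requestor helpers so that a completion
        	 * can later be matched back to the guest's packet address by its
        	 * request ID (see vmbus_request_addr_match() in hv_pci_bus_exit()).
        	 */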
   3533	hdev->channel->next_request_id_callback = vmbus_next_request_id;
   3534	hdev->channel->request_addr_callback = vmbus_request_addr;
   3535	hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
   3536
   3537	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
   3538			 hv_pci_onchannelcallback, hbus);
   3539	if (ret)
   3540		goto destroy_wq;
   3541
   3542	hv_set_drvdata(hdev, hbus);
   3543
   3544	ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
   3545					  ARRAY_SIZE(pci_protocol_versions));
   3546	if (ret)
   3547		goto close;
   3548
   3549	ret = hv_allocate_config_window(hbus);
   3550	if (ret)
   3551		goto close;
   3552
   3553	hbus->cfg_addr = ioremap(hbus->mem_config->start,
   3554				 PCI_CONFIG_MMIO_LENGTH);
   3555	if (!hbus->cfg_addr) {
   3556		dev_err(&hdev->device,
   3557			"Unable to map a virtual address for config space\n");
   3558		ret = -ENOMEM;
   3559		goto free_config;
   3560	}
   3561
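        	/*
        	 * Name the IRQ domain's fwnode after the VMBus instance GUID so
        	 * that each bus gets a unique name.
        	 */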
   3562	name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
   3563	if (!name) {
   3564		ret = -ENOMEM;
   3565		goto unmap;
   3566	}
   3567
   3568	hbus->fwnode = irq_domain_alloc_named_fwnode(name);
   3569	kfree(name);
   3570	if (!hbus->fwnode) {
   3571		ret = -ENOMEM;
   3572		goto unmap;
   3573	}
   3574
   3575	ret = hv_pcie_init_irq_domain(hbus);
   3576	if (ret)
   3577		goto free_fwnode;
   3578
   3579retry:
   3580	ret = hv_pci_query_relations(hdev);
   3581	if (ret)
   3582		goto free_irq_domain;
   3583
   3584	ret = hv_pci_enter_d0(hdev);
   3585	/*
    3586	 * In certain cases (kdump), the PCI device of interest was not
    3587	 * cleanly shut down and its resources are still held on the host
    3588	 * side, so the host could return an invalid device status.
    3589	 * We need to explicitly request that the host release the
    3590	 * resources and try to enter D0 again.
    3591	 * Since the hv_pci_bus_exit() call releases the structures of
    3592	 * all its child devices, we need to restart the retry from the
    3593	 * hv_pci_query_relations() call, asking the host to send the
    3594	 * synchronous child device relations message before this
    3595	 * information is needed in the hv_send_resources_allocated()
    3596	 * call later.
   3597	 */
   3598	if (ret == -EPROTO && enter_d0_retry) {
   3599		enter_d0_retry = false;
   3600
   3601		dev_err(&hdev->device, "Retrying D0 Entry\n");
   3602
   3603		/*
    3604		 * hv_pci_bus_exit() calls hv_send_resources_released()
   3605		 * to free up resources of its child devices.
   3606		 * In the kdump kernel we need to set the
   3607		 * wslot_res_allocated to 255 so it scans all child
   3608		 * devices to release resources allocated in the
   3609		 * normal kernel before panic happened.
   3610		 */
   3611		hbus->wslot_res_allocated = 255;
   3612		ret = hv_pci_bus_exit(hdev, true);
   3613
   3614		if (ret == 0)
   3615			goto retry;
   3616
   3617		dev_err(&hdev->device,
   3618			"Retrying D0 failed with ret %d\n", ret);
   3619	}
   3620	if (ret)
   3621		goto free_irq_domain;
   3622
   3623	ret = hv_pci_allocate_bridge_windows(hbus);
   3624	if (ret)
   3625		goto exit_d0;
   3626
   3627	ret = hv_send_resources_allocated(hdev);
   3628	if (ret)
   3629		goto free_windows;
   3630
   3631	prepopulate_bars(hbus);
   3632
   3633	hbus->state = hv_pcibus_probed;
   3634
   3635	ret = create_root_hv_pci_bus(hbus);
   3636	if (ret)
   3637		goto free_windows;
   3638
   3639	return 0;
   3640
   3641free_windows:
   3642	hv_pci_free_bridge_windows(hbus);
   3643exit_d0:
   3644	(void) hv_pci_bus_exit(hdev, true);
   3645free_irq_domain:
   3646	irq_domain_remove(hbus->irq_domain);
   3647free_fwnode:
   3648	irq_domain_free_fwnode(hbus->fwnode);
   3649unmap:
   3650	iounmap(hbus->cfg_addr);
   3651free_config:
   3652	hv_free_config_window(hbus);
   3653close:
   3654	vmbus_close(hdev->channel);
   3655destroy_wq:
   3656	destroy_workqueue(hbus->wq);
   3657free_dom:
   3658	hv_put_dom_num(hbus->bridge->domain_nr);
   3659free_bus:
   3660	kfree(hbus);
   3661	return ret;
   3662}
   3663
   3664static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
   3665{
   3666	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3667	struct vmbus_channel *chan = hdev->channel;
   3668	struct {
   3669		struct pci_packet teardown_packet;
   3670		u8 buffer[sizeof(struct pci_message)];
   3671	} pkt;
   3672	struct hv_pci_compl comp_pkt;
   3673	struct hv_pci_dev *hpdev, *tmp;
   3674	unsigned long flags;
   3675	u64 trans_id;
   3676	int ret;
   3677
   3678	/*
   3679	 * After the host sends the RESCIND_CHANNEL message, it doesn't
   3680	 * access the per-channel ringbuffer any longer.
   3681	 */
   3682	if (chan->rescind)
   3683		return 0;
   3684
   3685	if (!keep_devs) {
   3686		struct list_head removed;
   3687
    3688		/* Move all present children to the list on the stack */
   3689		INIT_LIST_HEAD(&removed);
   3690		spin_lock_irqsave(&hbus->device_list_lock, flags);
   3691		list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
   3692			list_move_tail(&hpdev->list_entry, &removed);
   3693		spin_unlock_irqrestore(&hbus->device_list_lock, flags);
   3694
   3695		/* Remove all children in the list */
   3696		list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
   3697			list_del(&hpdev->list_entry);
   3698			if (hpdev->pci_slot)
   3699				pci_destroy_slot(hpdev->pci_slot);
    3700			/* For the two refs taken in new_pcichild_device() */
   3701			put_pcichild(hpdev);
   3702			put_pcichild(hpdev);
   3703		}
   3704	}
   3705
   3706	ret = hv_send_resources_released(hdev);
   3707	if (ret) {
   3708		dev_err(&hdev->device,
   3709			"Couldn't send resources released packet(s)\n");
   3710		return ret;
   3711	}
   3712
   3713	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
   3714	init_completion(&comp_pkt.host_event);
   3715	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
   3716	pkt.teardown_packet.compl_ctxt = &comp_pkt;
   3717	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
   3718
   3719	ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message,
   3720				     sizeof(struct pci_message),
   3721				     (unsigned long)&pkt.teardown_packet,
   3722				     &trans_id, VM_PKT_DATA_INBAND,
   3723				     VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
   3724	if (ret)
   3725		return ret;
   3726
   3727	if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) {
   3728		/*
   3729		 * The completion packet on the stack becomes invalid after
   3730		 * 'return'; remove the ID from the VMbus requestor if the
   3731		 * identifier is still mapped to/associated with the packet.
   3732		 *
   3733		 * Cf. hv_pci_onchannelcallback().
   3734		 */
   3735		vmbus_request_addr_match(chan, trans_id,
   3736					 (unsigned long)&pkt.teardown_packet);
   3737		return -ETIMEDOUT;
   3738	}
   3739
   3740	return 0;
   3741}
   3742
   3743/**
   3744 * hv_pci_remove() - Remove routine for this VMBus channel
   3745 * @hdev:	VMBus's tracking struct for this root PCI bus
   3746 *
   3747 * Return: 0 on success, -errno on failure
   3748 */
   3749static int hv_pci_remove(struct hv_device *hdev)
   3750{
   3751	struct hv_pcibus_device *hbus;
   3752	int ret;
   3753
   3754	hbus = hv_get_drvdata(hdev);
   3755	if (hbus->state == hv_pcibus_installed) {
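        		/*
        		 * As in hv_pci_suspend() below, briefly disable the channel
        		 * callback tasklet while changing hbus->state so that no new
        		 * work items can be scheduled before the workqueue is destroyed.
        		 */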
   3756		tasklet_disable(&hdev->channel->callback_event);
   3757		hbus->state = hv_pcibus_removing;
   3758		tasklet_enable(&hdev->channel->callback_event);
   3759		destroy_workqueue(hbus->wq);
   3760		hbus->wq = NULL;
   3761		/*
   3762		 * At this point, no work is running or can be scheduled
    3763		 * on hbus->wq. We can't race with hv_pci_devices_present()
    3764		 * or hv_pci_eject_device(), so it's safe to proceed.
   3765		 */
   3766
   3767		/* Remove the bus from PCI's point of view. */
   3768		pci_lock_rescan_remove();
   3769		pci_stop_root_bus(hbus->bridge->bus);
   3770		hv_pci_remove_slots(hbus);
   3771		pci_remove_root_bus(hbus->bridge->bus);
   3772		pci_unlock_rescan_remove();
   3773	}
   3774
   3775	ret = hv_pci_bus_exit(hdev, false);
   3776
   3777	vmbus_close(hdev->channel);
   3778
   3779	iounmap(hbus->cfg_addr);
   3780	hv_free_config_window(hbus);
   3781	hv_pci_free_bridge_windows(hbus);
   3782	irq_domain_remove(hbus->irq_domain);
   3783	irq_domain_free_fwnode(hbus->fwnode);
   3784
   3785	hv_put_dom_num(hbus->bridge->domain_nr);
   3786
   3787	kfree(hbus);
   3788	return ret;
   3789}
   3790
   3791static int hv_pci_suspend(struct hv_device *hdev)
   3792{
   3793	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3794	enum hv_pcibus_state old_state;
   3795	int ret;
   3796
   3797	/*
   3798	 * hv_pci_suspend() must make sure there are no pending work items
   3799	 * before calling vmbus_close(), since it runs in a process context
   3800	 * as a callback in dpm_suspend().  When it starts to run, the channel
   3801	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
    3802	 * context, can still be running concurrently and scheduling new work
   3803	 * items onto hbus->wq in hv_pci_devices_present() and
   3804	 * hv_pci_eject_device(), and the work item handlers can access the
    3805	 * vmbus channel, which hv_pci_suspend() may be closing concurrently, e.g.
   3806	 * the work item handler pci_devices_present_work() ->
   3807	 * new_pcichild_device() writes to the vmbus channel.
   3808	 *
   3809	 * To eliminate the race, hv_pci_suspend() disables the channel
   3810	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
   3811	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
   3812	 * it knows that no new work item can be scheduled, and then it flushes
   3813	 * hbus->wq and safely closes the vmbus channel.
   3814	 */
   3815	tasklet_disable(&hdev->channel->callback_event);
   3816
   3817	/* Change the hbus state to prevent new work items. */
   3818	old_state = hbus->state;
   3819	if (hbus->state == hv_pcibus_installed)
   3820		hbus->state = hv_pcibus_removing;
   3821
   3822	tasklet_enable(&hdev->channel->callback_event);
   3823
   3824	if (old_state != hv_pcibus_installed)
   3825		return -EINVAL;
   3826
   3827	flush_workqueue(hbus->wq);
   3828
   3829	ret = hv_pci_bus_exit(hdev, true);
   3830	if (ret)
   3831		return ret;
   3832
   3833	vmbus_close(hdev->channel);
   3834
   3835	return 0;
   3836}
   3837
   3838static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
   3839{
   3840	struct irq_data *irq_data;
   3841	struct msi_desc *entry;
   3842	int ret = 0;
   3843
   3844	msi_lock_descs(&pdev->dev);
   3845	msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
   3846		irq_data = irq_get_irq_data(entry->irq);
   3847		if (WARN_ON_ONCE(!irq_data)) {
   3848			ret = -EINVAL;
   3849			break;
   3850		}
   3851
   3852		hv_compose_msi_msg(irq_data, &entry->msg);
   3853	}
   3854	msi_unlock_descs(&pdev->dev);
   3855
   3856	return ret;
   3857}
   3858
   3859/*
   3860 * Upon resume, pci_restore_msi_state() -> ... ->  __pci_write_msi_msg()
   3861 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
   3862 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
   3863 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
   3864 * Table entries.
   3865 */
   3866static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
   3867{
   3868	pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
   3869}
   3870
   3871static int hv_pci_resume(struct hv_device *hdev)
   3872{
   3873	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
   3874	enum pci_protocol_version_t version[1];
   3875	int ret;
   3876
   3877	hbus->state = hv_pcibus_init;
   3878
   3879	hdev->channel->next_request_id_callback = vmbus_next_request_id;
   3880	hdev->channel->request_addr_callback = vmbus_request_addr;
   3881	hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
   3882
   3883	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
   3884			 hv_pci_onchannelcallback, hbus);
   3885	if (ret)
   3886		return ret;
   3887
   3888	/* Only use the version that was in use before hibernation. */
   3889	version[0] = hbus->protocol_version;
   3890	ret = hv_pci_protocol_negotiation(hdev, version, 1);
   3891	if (ret)
   3892		goto out;
   3893
   3894	ret = hv_pci_query_relations(hdev);
   3895	if (ret)
   3896		goto out;
   3897
   3898	ret = hv_pci_enter_d0(hdev);
   3899	if (ret)
   3900		goto out;
   3901
   3902	ret = hv_send_resources_allocated(hdev);
   3903	if (ret)
   3904		goto out;
   3905
   3906	prepopulate_bars(hbus);
   3907
   3908	hv_pci_restore_msi_state(hbus);
   3909
   3910	hbus->state = hv_pcibus_installed;
   3911	return 0;
   3912out:
   3913	vmbus_close(hdev->channel);
   3914	return ret;
   3915}
   3916
   3917static const struct hv_vmbus_device_id hv_pci_id_table[] = {
   3918	/* PCI Pass-through Class ID */
   3919	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
   3920	{ HV_PCIE_GUID, },
   3921	{ },
   3922};
   3923
   3924MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
   3925
   3926static struct hv_driver hv_pci_drv = {
   3927	.name		= "hv_pci",
   3928	.id_table	= hv_pci_id_table,
   3929	.probe		= hv_pci_probe,
   3930	.remove		= hv_pci_remove,
   3931	.suspend	= hv_pci_suspend,
   3932	.resume		= hv_pci_resume,
   3933};
   3934
   3935static void __exit exit_hv_pci_drv(void)
   3936{
   3937	vmbus_driver_unregister(&hv_pci_drv);
   3938
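        	/* Unhook the PCI block r/w interface installed in init_hv_pci_drv() */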
   3939	hvpci_block_ops.read_block = NULL;
   3940	hvpci_block_ops.write_block = NULL;
   3941	hvpci_block_ops.reg_blk_invalidate = NULL;
   3942}
   3943
   3944static int __init init_hv_pci_drv(void)
   3945{
   3946	int ret;
   3947
   3948	if (!hv_is_hyperv_initialized())
   3949		return -ENODEV;
   3950
   3951	ret = hv_pci_irqchip_init();
   3952	if (ret)
   3953		return ret;
   3954
   3955	/* Set the invalid domain number's bit, so it will not be used */
   3956	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
   3957
   3958	/* Initialize PCI block r/w interface */
   3959	hvpci_block_ops.read_block = hv_read_config_block;
   3960	hvpci_block_ops.write_block = hv_write_config_block;
   3961	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
   3962
   3963	return vmbus_driver_register(&hv_pci_drv);
   3964}
   3965
   3966module_init(init_hv_pci_drv);
   3967module_exit(exit_hv_pci_drv);
   3968
   3969MODULE_DESCRIPTION("Hyper-V PCI");
   3970MODULE_LICENSE("GPL v2");