From f88eecfe2f22b2790e7527c0aaec14ea175919de Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Tue, 16 Aug 2016 16:05:08 +0200 Subject: genirq/generic_chip: Verify irqs_per_chip <= 32 Most (if not all) code here implicitly assumes that the maximum number of IRQs per chip will be 32, and thus uses 'u32' or 'unsigned long' for many tasks (for example "struct irq_data" declares its 'mask' field as 'u32', and "struct irq_chip_generic" declares its 'installed' field as 'unsigned long'). However, there is no check to verify that irqs_per_chip is <= 32. Hence, calling irq_alloc_domain_generic_chips() with a bigger value will result in unexpected results. Provide a wrapper with a MAYBE_BUILD_BUG_ON(irqs_per_chip > 32) to catch such cases. [ tglx: Reduced changelog to the essential information ] Signed-off-by: Sebastian Frias Cc: Marc Zyngier Cc: Mason Cc: Jason Cooper Link: http://lkml.kernel.org/r/57B31D94.5040701@laposte.net Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b52424eaa0ed..603986741f2c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -916,12 +916,20 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags flags); +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags flags); + +#define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ + handler, clr, set, flags) \ +({ \ + 
MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ + __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ + handler, clr, set, flags); \ +}) static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { -- cgit v1.2.3-71-gd317 From 88ef16d888a094587b2ac77de60927df5da5d56d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:54:20 +0200 Subject: ACPI: I/O Remapping Table (IORT) initial support IORT shows representation of IO topology for ARM based systems. It describes how various components are connected together on parent-child basis e.g. PCI RC -> SMMU -> ITS. Also see IORT spec. http://infocenter.arm.com/help/topic/com.arm.doc.den0049b/DEN0049B_IO_Remapping_Table.pdf Initial support allows to detect IORT table presence and save its root pointer obtained through acpi_get_table(). The pointer validity depends on acpi_gbl_permanent_mmap because if acpi_gbl_permanent_mmap is not set while using IORT nodes we would dereference unmapped pointers. For the aforementioned reason call acpi_iort_init() from acpi_init() which guarantees acpi_gbl_permanent_mmap to be set at that point. Add generic helpers which are helpful for scanning and retrieving information from IORT table content. List of the most important helpers: - iort_find_dev_node() finds IORT node for a given device - iort_node_map_rid() maps device RID and returns IORT node which provides final translation IORT support is placed under drivers/acpi/arm64/ new directory due to its ARM64 specific nature. The code there is considered only for ARM64. The long term plan is to keep all ARM64 specific tables support in this place e.g. GTDT table. Signed-off-by: Tomasz Nowicki Acked-by: Rafael J. 
Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Lorenzo Pieralisi Signed-off-by: Marc Zyngier --- drivers/acpi/Kconfig | 4 + drivers/acpi/Makefile | 2 + drivers/acpi/arm64/Kconfig | 6 ++ drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/iort.c | 216 ++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/bus.c | 2 + include/linux/acpi_iort.h | 30 ++++++ 7 files changed, 261 insertions(+) create mode 100644 drivers/acpi/arm64/Kconfig create mode 100644 drivers/acpi/arm64/Makefile create mode 100644 drivers/acpi/arm64/iort.c create mode 100644 include/linux/acpi_iort.h (limited to 'include/linux') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 445ce28475b3..d5c06145d07f 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -521,4 +521,8 @@ config ACPI_CONFIGFS userspace. The configurable ACPI groups will be visible under /config/acpi, assuming configfs is mounted under /config. +if ARM64 +source "drivers/acpi/arm64/Kconfig" +endif + endif # ACPI diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 5ae9d85c5159..e5ada7895697 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -105,3 +105,5 @@ obj-$(CONFIG_ACPI_CONFIGFS) += acpi_configfs.o video-objs += acpi_video.o video_detect.o obj-y += dptf/ + +obj-$(CONFIG_ARM64) += arm64/ diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig new file mode 100644 index 000000000000..4616da4c15be --- /dev/null +++ b/drivers/acpi/arm64/Kconfig @@ -0,0 +1,6 @@ +# +# ACPI Configuration for ARM64 +# + +config ACPI_IORT + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile new file mode 100644 index 000000000000..72331f2ce0e9 --- /dev/null +++ b/drivers/acpi/arm64/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_ACPI_IORT) += iort.o diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c new file mode 100644 index 000000000000..5279a358924a --- /dev/null +++ b/drivers/acpi/arm64/iort.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2016, Semihalf + * 
Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * This file implements early detection/parsing of I/O mapping + * reported to OS through firmware via I/O Remapping Table (IORT) + * IORT document number: ARM DEN 0049A + */ + +#define pr_fmt(fmt) "ACPI: IORT: " fmt + +#include +#include +#include + +typedef acpi_status (*iort_find_node_callback) + (struct acpi_iort_node *node, void *context); + +/* Root pointer to the mapped IORT table */ +static struct acpi_table_header *iort_table; + +static LIST_HEAD(iort_msi_chip_list); +static DEFINE_SPINLOCK(iort_msi_chip_lock); + +static struct acpi_iort_node *iort_scan_node(enum acpi_iort_node_type type, + iort_find_node_callback callback, + void *context) +{ + struct acpi_iort_node *iort_node, *iort_end; + struct acpi_table_iort *iort; + int i; + + if (!iort_table) + return NULL; + + /* Get the first IORT node */ + iort = (struct acpi_table_iort *)iort_table; + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort, + iort->node_offset); + iort_end = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, + iort_table->length); + + for (i = 0; i < iort->node_count; i++) { + if (WARN_TAINT(iort_node >= iort_end, TAINT_FIRMWARE_WORKAROUND, + "IORT node pointer overflows, bad table!\n")) + return NULL; + + if (iort_node->type == type && + ACPI_SUCCESS(callback(iort_node, context))) + return iort_node; + + iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort_node, + iort_node->length); + } + + return NULL; +} + +static acpi_status iort_match_node_callback(struct acpi_iort_node *node, + void *context) +{ + 
struct device *dev = context; + acpi_status status; + + if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_device *adev = to_acpi_device_node(dev->fwnode); + struct acpi_iort_named_component *ncomp; + + if (!adev) { + status = AE_NOT_FOUND; + goto out; + } + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(dev, "Can't get device full path name\n"); + goto out; + } + + ncomp = (struct acpi_iort_named_component *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? + AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } else if (node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + struct acpi_iort_root_complex *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_iort_root_complex *)node->node_data; + + /* + * It is assumed that PCI segment numbers maps one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pci_segment_number == pci_domain_nr(bus) ? 
+ AE_OK : AE_NOT_FOUND; + } else { + status = AE_NOT_FOUND; + } +out: + return status; +} + +static int iort_id_map(struct acpi_iort_id_mapping *map, u8 type, u32 rid_in, + u32 *rid_out) +{ + /* Single mapping does not care for input id */ + if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) { + if (type == ACPI_IORT_NODE_NAMED_COMPONENT || + type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + *rid_out = map->output_base; + return 0; + } + + pr_warn(FW_BUG "[map %p] SINGLE MAPPING flag not allowed for node type %d, skipping ID map\n", + map, type); + return -ENXIO; + } + + if (rid_in < map->input_base || + (rid_in >= map->input_base + map->id_count)) + return -ENXIO; + + *rid_out = map->output_base + (rid_in - map->input_base); + return 0; +} + +static struct acpi_iort_node *iort_node_map_rid(struct acpi_iort_node *node, + u32 rid_in, u32 *rid_out, + u8 type) +{ + u32 rid = rid_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_iort_id_mapping *map; + int i; + + if (node->type == type) { + if (rid_out) + *rid_out = rid; + return node; + } + + if (!node->mapping_offset || !node->mapping_count) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node, + node->mapping_offset); + + /* Firmware bug! 
*/ + if (!map->output_reference) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the RID translation */ + for (i = 0; i < node->mapping_count; i++, map++) { + if (!iort_id_map(map, node->type, rid, &rid)) + break; + } + + if (i == node->mapping_count) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_iort_node, iort_table, + map->output_reference); + } + +fail_map: + /* Map input RID to output RID unchanged on mapping failure*/ + if (rid_out) + *rid_out = rid_in; + + return NULL; +} + +static struct acpi_iort_node *iort_find_dev_node(struct device *dev) +{ + struct pci_bus *pbus; + + if (!dev_is_pci(dev)) + return iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, + iort_match_node_callback, dev); + + /* Find a PCI root bus */ + pbus = to_pci_dev(dev)->bus; + while (!pci_is_root_bus(pbus)) + pbus = pbus->parent; + + return iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX, + iort_match_node_callback, &pbus->dev); +} + +void __init acpi_iort_init(void) +{ + acpi_status status; + + status = acpi_get_table(ACPI_SIG_IORT, 0, &iort_table); + if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + pr_err("Failed to get table, %s\n", msg); + } +} diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 85b7d07fe5c8..e56e6438515a 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -36,6 +36,7 @@ #ifdef CONFIG_X86 #include #endif +#include #include #include #include @@ -1186,6 +1187,7 @@ static int __init acpi_init(void) } pci_mmcfg_late_init(); + acpi_iort_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h new file mode 100644 index 000000000000..fcacaf7ed64d --- /dev/null +++ b/include/linux/acpi_iort.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2016, Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify it 
+ * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#ifndef __ACPI_IORT_H__ +#define __ACPI_IORT_H__ + +#include + +#ifdef CONFIG_ACPI_IORT +void acpi_iort_init(void); +#else +static inline void acpi_iort_init(void) { } +#endif + +#endif /* __ACPI_IORT_H__ */ -- cgit v1.2.3-71-gd317 From 4bf2efd26d7624372fb7adff8745b4c2e8407004 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:21 +0200 Subject: ACPI: Add new IORT functions to support MSI domain handling For ITS, MSI functionality consists of building a domain stack, and during that process we need to refer to domain stack components, e.g. before we create a new DOMAIN_BUS_PCI_MSI domain we need to specify its DOMAIN_BUS_NEXUS parent domain. In order to manage that process properly, maintain a list whose elements contain a domain token (unique for an MSI domain stack) and an ITS ID: iort_register_domain_token() and iort_deregister_domain_token(). Then retrieve the domain token any time later, using the ITS ID as the key: iort_find_domain_token(). With the domain token and domain type we are able to find the corresponding IRQ domain. Since IORT is prepared to describe the MSI domain on a per-device basis, use existing IORT helpers and implement two calls: 1. iort_msi_map_rid() to map MSI RID for a device 2. iort_get_device_domain() to find the MSI domain for a device Signed-off-by: Tomasz Nowicki Acked-by: Rafael J. 
Wysocki Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/acpi/arm64/iort.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi_iort.h | 12 ++++ 2 files changed, 164 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5279a358924a..6b81746cd13c 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -22,6 +22,12 @@ #include #include +struct iort_its_msi_chip { + struct list_head list; + struct fwnode_handle *fw_node; + u32 translation_id; +}; + typedef acpi_status (*iort_find_node_callback) (struct acpi_iort_node *node, void *context); @@ -31,6 +37,76 @@ static struct acpi_table_header *iort_table; static LIST_HEAD(iort_msi_chip_list); static DEFINE_SPINLOCK(iort_msi_chip_lock); +/** + * iort_register_domain_token() - register domain token and related ITS ID + * to the list from where we can get it back later on. + * @trans_id: ITS ID. + * @fw_node: Domain token. + * + * Returns: 0 on success, -ENOMEM if no memory when allocating list element + */ +int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node) +{ + struct iort_its_msi_chip *its_msi_chip; + + its_msi_chip = kzalloc(sizeof(*its_msi_chip), GFP_KERNEL); + if (!its_msi_chip) + return -ENOMEM; + + its_msi_chip->fw_node = fw_node; + its_msi_chip->translation_id = trans_id; + + spin_lock(&iort_msi_chip_lock); + list_add(&its_msi_chip->list, &iort_msi_chip_list); + spin_unlock(&iort_msi_chip_lock); + + return 0; +} + +/** + * iort_deregister_domain_token() - Deregister domain token based on ITS ID + * @trans_id: ITS ID. + * + * Returns: none. 
+ */ +void iort_deregister_domain_token(int trans_id) +{ + struct iort_its_msi_chip *its_msi_chip, *t; + + spin_lock(&iort_msi_chip_lock); + list_for_each_entry_safe(its_msi_chip, t, &iort_msi_chip_list, list) { + if (its_msi_chip->translation_id == trans_id) { + list_del(&its_msi_chip->list); + kfree(its_msi_chip); + break; + } + } + spin_unlock(&iort_msi_chip_lock); +} + +/** + * iort_find_domain_token() - Find domain token based on given ITS ID + * @trans_id: ITS ID. + * + * Returns: domain token when find on the list, NULL otherwise + */ +struct fwnode_handle *iort_find_domain_token(int trans_id) +{ + struct fwnode_handle *fw_node = NULL; + struct iort_its_msi_chip *its_msi_chip; + + spin_lock(&iort_msi_chip_lock); + list_for_each_entry(its_msi_chip, &iort_msi_chip_list, list) { + if (its_msi_chip->translation_id == trans_id) { + fw_node = its_msi_chip->fw_node; + break; + } + } + spin_unlock(&iort_msi_chip_lock); + + return fw_node; +} + static struct acpi_iort_node *iort_scan_node(enum acpi_iort_node_type type, iort_find_node_callback callback, void *context) @@ -204,6 +280,82 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) iort_match_node_callback, &pbus->dev); } +/** + * iort_msi_map_rid() - Map a MSI requester ID for a device + * @dev: The device for which the mapping is to be done. + * @req_id: The device requester ID. + * + * Returns: mapped MSI RID on success, input requester ID otherwise + */ +u32 iort_msi_map_rid(struct device *dev, u32 req_id) +{ + struct acpi_iort_node *node; + u32 dev_id; + + node = iort_find_dev_node(dev); + if (!node) + return req_id; + + iort_node_map_rid(node, req_id, &dev_id, ACPI_IORT_NODE_ITS_GROUP); + return dev_id; +} + +/** + * iort_dev_find_its_id() - Find the ITS identifier for a device + * @dev: The device. + * @idx: Index of the ITS identifier list. + * @its_id: ITS identifier. 
+ * + * Returns: 0 on success, appropriate error value otherwise + */ +static int iort_dev_find_its_id(struct device *dev, u32 req_id, + unsigned int idx, int *its_id) +{ + struct acpi_iort_its_group *its; + struct acpi_iort_node *node; + + node = iort_find_dev_node(dev); + if (!node) + return -ENXIO; + + node = iort_node_map_rid(node, req_id, NULL, ACPI_IORT_NODE_ITS_GROUP); + if (!node) + return -ENXIO; + + /* Move to ITS specific data */ + its = (struct acpi_iort_its_group *)node->node_data; + if (idx >= its->its_count) { + dev_err(dev, "requested ITS ID index [%d] is greater than available [%d]\n", + idx, its->its_count); + return -ENXIO; + } + + *its_id = its->identifiers[idx]; + return 0; +} + +/** + * iort_get_device_domain() - Find MSI domain related to a device + * @dev: The device. + * @req_id: Requester ID for the device. + * + * Returns: the MSI domain for this device, NULL otherwise + */ +struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +{ + struct fwnode_handle *handle; + int its_id; + + if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + return NULL; + + handle = iort_find_domain_token(its_id); + if (!handle) + return NULL; + + return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); +} + void __init acpi_iort_init(void) { acpi_status status; diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index fcacaf7ed64d..0e32dac8fd03 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -20,11 +20,23 @@ #define __ACPI_IORT_H__ #include +#include +#include +int iort_register_domain_token(int trans_id, struct fwnode_handle *fw_node); +void iort_deregister_domain_token(int trans_id); +struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); +u32 iort_msi_map_rid(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); #else static inline void acpi_iort_init(void) { } +static inline u32 
iort_msi_map_rid(struct device *dev, u32 req_id) +{ return req_id; } +static inline struct irq_domain *iort_get_device_domain(struct device *dev, + u32 req_id) +{ return NULL; } #endif #endif /* __ACPI_IORT_H__ */ -- cgit v1.2.3-71-gd317 From db40f0a7aea5e03ef044ef5dbc51a364e1ff7991 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Mon, 12 Sep 2016 20:32:24 +0200 Subject: irqchip/gicv3-its: Refactor ITS DT init code to prepare for ACPI In order to add ACPI support we need to isolate ACPI&DT common code and move DT logic to corresponding functions. To achieve this we are using firmware agnostic handle which can be unpacked to either DT or ACPI node. No functional changes other than a very minor one: 1. Terminate its_init call with -ENODEV for non-DT case which allows to remove hack from its-gic-v3.c. 2. Fix ITS base register address type (from 'unsigned long' to 'phys_addr_t'), as a bonus we get nice string formatting. 3. Since there is only one of ITS parent domain convert it to static global variable and drop the parameter from its_probe_one. Users can refer to it in more convenient way then. 
Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 65 ++++++++++++++++++++++---------------- drivers/irqchip/irq-gic-v3.c | 7 ++-- include/linux/irqchip/arm-gic-v3.h | 4 +-- 3 files changed, 42 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 943442d689d8..c7518c7b48bc 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -75,7 +75,7 @@ struct its_node { raw_spinlock_t lock; struct list_head entry; void __iomem *base; - unsigned long phys_base; + phys_addr_t phys_base; struct its_cmd_block *cmd_base; struct its_cmd_block *cmd_write; struct its_baser tables[GITS_BASER_NR_REGS]; @@ -115,6 +115,7 @@ struct its_device { static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; +static struct irq_domain *its_parent; #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) @@ -1614,8 +1615,7 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } -static int its_init_domain(struct device_node *node, struct its_node *its, - struct irq_domain *parent) +static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) { struct irq_domain *inner_domain; struct msi_domain_info *info; @@ -1624,13 +1624,13 @@ static int its_init_domain(struct device_node *node, struct its_node *its, if (!info) return -ENOMEM; - inner_domain = irq_domain_add_tree(node, &its_domain_ops, its); + inner_domain = irq_domain_create_tree(handle, &its_domain_ops, its); if (!inner_domain) { kfree(info); return -ENOMEM; } - inner_domain->parent = parent; + inner_domain->parent = its_parent; inner_domain->bus_token = DOMAIN_BUS_NEXUS; info->ops = &its_msi_domain_ops; info->data = its; @@ -1639,43 +1639,35 @@ static int its_init_domain(struct 
device_node *node, struct its_node *its, return 0; } -static int __init its_probe(struct device_node *node, - struct irq_domain *parent) +static int __init its_probe_one(struct resource *res, + struct fwnode_handle *handle, int numa_node) { - struct resource res; struct its_node *its; void __iomem *its_base; u32 val; u64 baser, tmp; int err; - err = of_address_to_resource(node, 0, &res); - if (err) { - pr_warn("%s: no regs?\n", node->full_name); - return -ENXIO; - } - - its_base = ioremap(res.start, resource_size(&res)); + its_base = ioremap(res->start, resource_size(res)); if (!its_base) { - pr_warn("%s: unable to map registers\n", node->full_name); + pr_warn("ITS@%pa: Unable to map ITS registers\n", &res->start); return -ENOMEM; } val = readl_relaxed(its_base + GITS_PIDR2) & GIC_PIDR2_ARCH_MASK; if (val != 0x30 && val != 0x40) { - pr_warn("%s: no ITS detected, giving up\n", node->full_name); + pr_warn("ITS@%pa: No ITS detected, giving up\n", &res->start); err = -ENODEV; goto out_unmap; } err = its_force_quiescent(its_base); if (err) { - pr_warn("%s: failed to quiesce, giving up\n", - node->full_name); + pr_warn("ITS@%pa: Failed to quiesce, giving up\n", &res->start); goto out_unmap; } - pr_info("ITS: %s\n", node->full_name); + pr_info("ITS %pR\n", res); its = kzalloc(sizeof(*its), GFP_KERNEL); if (!its) { @@ -1687,9 +1679,9 @@ static int __init its_probe(struct device_node *node, INIT_LIST_HEAD(&its->entry); INIT_LIST_HEAD(&its->its_device_list); its->base = its_base; - its->phys_base = res.start; + its->phys_base = res->start; its->ite_size = ((readl_relaxed(its_base + GITS_TYPER) >> 4) & 0xf) + 1; - its->numa_node = of_node_to_nid(node); + its->numa_node = numa_node; its->cmd_base = kzalloc(ITS_CMD_QUEUE_SZ, GFP_KERNEL); if (!its->cmd_base) { @@ -1736,7 +1728,7 @@ static int __init its_probe(struct device_node *node, writeq_relaxed(0, its->base + GITS_CWRITER); writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); - err = its_init_domain(node, its, parent); 
+ err = its_init_domain(handle, its); if (err) goto out_free_tables; @@ -1754,7 +1746,7 @@ out_free_its: kfree(its); out_unmap: iounmap(its_base); - pr_err("ITS: failed probing %s (%d)\n", node->full_name, err); + pr_err("ITS@%pa: failed probing (%d)\n", &res->start, err); return err; } @@ -1782,10 +1774,10 @@ static struct of_device_id its_device_id[] = { {}, }; -int __init its_init(struct device_node *node, struct rdists *rdists, - struct irq_domain *parent_domain) +static int __init its_of_probe(struct device_node *node) { struct device_node *np; + struct resource res; for (np = of_find_matching_node(node, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { @@ -1795,8 +1787,27 @@ int __init its_init(struct device_node *node, struct rdists *rdists, continue; } - its_probe(np, parent_domain); + if (of_address_to_resource(np, 0, &res)) { + pr_warn("%s: no regs?\n", np->full_name); + continue; + } + + its_probe_one(&res, &np->fwnode, of_node_to_nid(np)); } + return 0; +} + +int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, + struct irq_domain *parent_domain) +{ + struct device_node *of_node; + + its_parent = parent_domain; + of_node = to_of_node(handle); + if (of_node) + its_of_probe(of_node); + else + return -ENODEV; if (list_empty(&its_nodes)) { pr_warn("ITS: No ITS available, not enabling LPIs\n"); diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index ecc5b2360c7a..850f9c422f24 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -918,7 +918,6 @@ static int __init gic_init_bases(void __iomem *dist_base, u64 redist_stride, struct fwnode_handle *handle) { - struct device_node *node; u32 typer; int gic_irqs; int err; @@ -959,10 +958,8 @@ static int __init gic_init_bases(void __iomem *dist_base, set_handle_irq(gic_handle_irq); - node = to_of_node(handle); - if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() && - node) /* Temp hack to prevent ITS init for ACPI */ 
- its_init(node, &gic_data.rdists, gic_data.domain); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) + its_init(handle, &gic_data.rdists, gic_data.domain); gic_smp_init(); gic_dist_init(); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 99ac022edc60..8361c8d3edd1 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -430,9 +430,9 @@ struct rdists { }; struct irq_domain; -struct device_node; +struct fwnode_handle; int its_cpu_init(void); -int its_init(struct device_node *node, struct rdists *rdists, +int its_init(struct fwnode_handle *handle, struct rdists *rdists, struct irq_domain *domain); static inline bool gic_enable_sre(void) -- cgit v1.2.3-71-gd317 From ecb3f394c5dba897d215a5422f1b363e93e2ce4e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 13 Sep 2016 12:14:51 -0400 Subject: genirq: Expose interrupt information through sysfs Information about interrupts is exposed via /proc/interrupts, but the format of that file has changed over kernel versions and differs across architectures. It also has varying column numbers depending on hardware. That all makes it hard for tools to parse. To solve this, expose the information through sysfs so each irq attribute is in a separate file in a consistent, machine parsable way. This feature is only available when both CONFIG_SPARSE_IRQ and CONFIG_SYSFS are enabled. 
Examples: /sys/kernel/irq/18/actions: i801_smbus,ehci_hcd:usb1,uhci_hcd:usb7 /sys/kernel/irq/18/chip_name: IR-IO-APIC /sys/kernel/irq/18/hwirq: 18 /sys/kernel/irq/18/name: fasteoi /sys/kernel/irq/18/per_cpu_count: 0,0 /sys/kernel/irq/18/type: level /sys/kernel/irq/25/actions: ahci0 /sys/kernel/irq/25/chip_name: IR-PCI-MSI /sys/kernel/irq/25/hwirq: 512000 /sys/kernel/irq/25/name: edge /sys/kernel/irq/25/per_cpu_count: 29036,0 /sys/kernel/irq/25/type: edge [ tglx: Moved kobject_del() under sparse_irq_lock, massaged code comments and changelog ] Signed-off-by: Craig Gallek Cc: David Decotigny Link: http://lkml.kernel.org/r/1473783291-122873-1-git-send-email-kraigatgoog@gmail.com Signed-off-by: Thomas Gleixner --- Documentation/ABI/testing/sysfs-kernel-irq | 53 ++++++++ include/linux/irqdesc.h | 3 + kernel/irq/irqdesc.c | 193 ++++++++++++++++++++++++++++- 3 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-irq (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-irq b/Documentation/ABI/testing/sysfs-kernel-irq new file mode 100644 index 000000000000..eb074b100986 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-irq @@ -0,0 +1,53 @@ +What: /sys/kernel/irq +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Directory containing information about the system's IRQs. + Specifically, data from the associated struct irq_desc. + The information here is similar to that in /proc/interrupts + but in a more machine-friendly format. This directory contains + one subdirectory for each Linux IRQ number. + +What: /sys/kernel/irq//actions +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The IRQ action chain. A comma-separated list of zero or more + device names associated with this interrupt. 
+ +What: /sys/kernel/irq//chip_name +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Human-readable chip name supplied by the associated device + driver. + +What: /sys/kernel/irq//hwirq +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: When interrupt translation domains are used, this file contains + the underlying hardware IRQ number used for this Linux IRQ. + +What: /sys/kernel/irq//name +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: Human-readable flow handler name as defined by the irq chip + driver. + +What: /sys/kernel/irq//per_cpu_count +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The number of times the interrupt has fired since boot. This + is a comma-separated list of counters; one per CPU in CPU id + order. NOTE: This file consistently shows counters for all + CPU ids. This differs from the behavior of /proc/interrupts + which only shows counters for online CPUs. + +What: /sys/kernel/irq//type +Date: September 2016 +KernelVersion: 4.9 +Contact: Craig Gallek +Description: The type of the interrupt. Either the string 'level' or 'edge'. 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index b51beebf9804..c9be57931b58 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -2,6 +2,7 @@ #define _LINUX_IRQDESC_H #include +#include /* * Core internal functions to deal with irq descriptors @@ -43,6 +44,7 @@ struct pt_regs; * @force_resume_depth: number of irqactions on a irq descriptor with * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free + * @kobj: kobject used to represent this struct in sysfs * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -88,6 +90,7 @@ struct irq_desc { #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; + struct kobject kobj; #endif int parent_irq; struct module *owner; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..93b51727abaa 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internals.h" @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ +static void irq_kobj_release(struct kobject *kobj); + +#ifdef CONFIG_SYSFS +static struct kobject *irq_kobj_base; + +#define IRQ_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t per_cpu_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + int cpu, irq = desc->irq_data.irq; + ssize_t ret = 0; + char *p = ""; + + for_each_possible_cpu(cpu) { + unsigned int c = kstat_irqs_cpu(irq, cpu); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + p = ","; + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +IRQ_ATTR_RO(per_cpu_count); + +static ssize_t chip_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + 
raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) { + ret = scnprintf(buf, PAGE_SIZE, "%s\n", + desc->irq_data.chip->name); + } + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(chip_name); + +static ssize_t hwirq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.domain) + ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(hwirq); + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(type); + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->name) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(name); + +static ssize_t actions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + struct irqaction *action; + ssize_t ret = 0; + char *p = ""; + + raw_spin_lock_irq(&desc->lock); + for (action = desc->action; action != NULL; action = action->next) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + p, action->name); + p = ","; + } + raw_spin_unlock_irq(&desc->lock); + + if (ret) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} +IRQ_ATTR_RO(actions); + +static struct attribute *irq_attrs[] = { + 
&per_cpu_count_attr.attr, + &chip_name_attr.attr, + &hwirq_attr.attr, + &type_attr.attr, + &name_attr.attr, + &actions_attr.attr, + NULL +}; + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = irq_attrs, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) +{ + if (irq_kobj_base) { + /* + * Continue even in case of failure as this is nothing + * crucial. + */ + if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) + pr_warn("Failed to add kobject for irq %d\n", irq); + } +} + +static int __init irq_sysfs_init(void) +{ + struct irq_desc *desc; + int irq; + + /* Prevent concurrent irq alloc/free */ + irq_lock_sparse(); + + irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); + if (!irq_kobj_base) { + irq_unlock_sparse(); + return -ENOMEM; + } + + /* Add the already allocated interrupts */ + for_each_irq_desc(irq, desc) + irq_sysfs_add(irq, desc); + irq_unlock_sparse(); + + return 0; +} +postcore_initcall(irq_sysfs_init); + +#else /* !CONFIG_SYSFS */ + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) {} + +#endif /* CONFIG_SYSFS */ + static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); + kobject_init(&desc->kobj, &irq_kobj_type); return desc; @@ -197,15 +374,22 @@ err_desc: return NULL; } -static void delayed_free_desc(struct rcu_head *rhp) +static void irq_kobj_release(struct kobject *kobj) { - struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } +static void delayed_free_desc(struct rcu_head *rhp) +{ + 
struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + kobject_put(&desc->kobj); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. + * + * The sysfs entry must be serialized against a concurrent + * irq_sysfs_init() as well. */ mutex_lock(&sparse_irq_lock); + kobject_del(&desc->kobj); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -261,6 +449,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); + irq_sysfs_add(start + i, desc); mutex_unlock(&sparse_irq_lock); } return start; -- cgit v1.2.3-71-gd317 From 28f4b04143c56135b1ca742fc64b664ed04de6a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:47 +0200 Subject: genirq/msi: Add cpumask allocation to alloc_msi_entry For irq spreading we want to store affinity masks in the msi_entry. Add the infrastructure for it. We allocate an array of cpumasks with an array size of the number of used vectors in the entry, so we can hand in the information per linux interrupt later. As we hand in the number of used vectors, we assign them right away. Convert all the call sites. 
Signed-off-by: Thomas Gleixner Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/1473862739-15032-2-git-send-email-hch@lst.de --- drivers/base/platform-msi.c | 3 +-- drivers/pci/msi.c | 6 ++---- drivers/staging/fsl-mc/bus/mc-msi.c | 3 +-- include/linux/msi.h | 5 +++-- kernel/irq/msi.c | 26 ++++++++++++++++++++++++-- 5 files changed, 31 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 279e53989374..be6a599bc0c1 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -142,13 +142,12 @@ static int platform_msi_alloc_descs_with_irq(struct device *dev, int virq, } for (i = 0; i < nvec; i++) { - desc = alloc_msi_entry(dev); + desc = alloc_msi_entry(dev, 1, NULL); if (!desc) break; desc->platform.msi_priv_data = data; desc->platform.msi_index = base + i; - desc->nvec_used = 1; desc->irq = virq ? 
virq + i : 0; list_add_tail(&desc->list, dev_to_msi_list(dev)); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98f12223c734..0db72ba24003 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -555,7 +555,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) struct msi_desc *entry; /* MSI Entry Initialization */ - entry = alloc_msi_entry(&dev->dev); + entry = alloc_msi_entry(&dev->dev, nvec, NULL); if (!entry) return NULL; @@ -568,7 +568,6 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); - entry->nvec_used = nvec; entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) @@ -693,7 +692,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, mask = cpumask_of(cpu); } - entry = alloc_msi_entry(&dev->dev); + entry = alloc_msi_entry(&dev->dev, 1, NULL); if (!entry) { if (!i) iounmap(base); @@ -711,7 +710,6 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr = i; entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; - entry->nvec_used = 1; entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); diff --git a/drivers/staging/fsl-mc/bus/mc-msi.c b/drivers/staging/fsl-mc/bus/mc-msi.c index c7be156ae5e0..4fd8e41ef468 100644 --- a/drivers/staging/fsl-mc/bus/mc-msi.c +++ b/drivers/staging/fsl-mc/bus/mc-msi.c @@ -213,7 +213,7 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int irq_count) struct msi_desc *msi_desc; for (i = 0; i < irq_count; i++) { - msi_desc = alloc_msi_entry(dev); + msi_desc = alloc_msi_entry(dev, 1, NULL); if (!msi_desc) { dev_err(dev, "Failed to allocate msi entry\n"); error = -ENOMEM; @@ -221,7 +221,6 @@ static int fsl_mc_msi_alloc_descs(struct device *dev, unsigned int 
irq_count) } msi_desc->fsl_mc.msi_index = i; - msi_desc->nvec_used = 1; INIT_LIST_HEAD(&msi_desc->list); list_add_tail(&msi_desc->list, dev_to_msi_list(dev)); } diff --git a/include/linux/msi.h b/include/linux/msi.h index e8c81fbd5f9c..0db320b7bb15 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -68,7 +68,7 @@ struct msi_desc { unsigned int nvec_used; struct device *dev; struct msi_msg msg; - const struct cpumask *affinity; + struct cpumask *affinity; union { /* PCI MSI/X specific data */ @@ -123,7 +123,8 @@ static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc) } #endif /* CONFIG_PCI_MSI */ -struct msi_desc *alloc_msi_entry(struct device *dev); +struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, + const struct cpumask *affinity); void free_msi_entry(struct msi_desc *entry); void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,20 +18,42 @@ /* Temparory solution for building, will be removed later */ #include -struct msi_desc *alloc_msi_entry(struct device *dev) +/** + * alloc_msi_entry - Allocate and initialize msi_entry + * @dev: Pointer to the device for which this is allocated + * @nvec: The number of vectors used in this entry + * @affinity: Optional pointer to an affinity mask array size of @nvec + * + * If @affinity is not NULL then an affinity array[@nvec] is allocated + * and the affinity masks from @affinity are copied. 
+ */ +struct msi_desc * +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) { - struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + struct msi_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; INIT_LIST_HEAD(&desc->list); desc->dev = dev; + desc->nvec_used = nvec; + if (affinity) { + desc->affinity = kmemdup(affinity, + nvec * sizeof(*desc->affinity), GFP_KERNEL); + if (!desc->affinity) { + kfree(desc); + return NULL; + } + } return desc; } void free_msi_entry(struct msi_desc *entry) { + kfree(entry->affinity); kfree(entry); } -- cgit v1.2.3-71-gd317 From 34c3d9819fda464be4f1bec59b63353814f76c73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:48 +0200 Subject: genirq/affinity: Provide smarter irq spreading infrastructure The current irq spreading infrastructure is just looking at a cpumask and tries to spread the interrupts over the mask. That's suboptimal as it does not take numa nodes into account. Change the logic so the interrupts are spread across numa nodes and inside the nodes. If there are more cpus than vectors per node, then we set the affinity to several cpus. If HT siblings are available we take that into account and try to set all siblings to a single vector. 
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-3-git-send-email-hch@lst.de --- include/linux/interrupt.h | 15 +++++ kernel/irq/affinity.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b6683f0ffc9f..4e59d122cad9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -279,6 +279,8 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); #else /* CONFIG_SMP */ @@ -316,6 +318,19 @@ static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) *nr_vecs = 1; return NULL; } + +static inline struct cpumask * +irq_create_affinity_masks(const struct cpumask *affinity, int nvec) +{ + return NULL; +} + +static inline int +irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + return maxvec; +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..7812fecc6e2f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -4,6 +4,155 @@ #include #include +static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + int cpus_per_vec) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_vec > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_vec--; + + /* If the cpu has siblings, use them first */ + siblmsk = 
topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_vec > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_vec--; + } + } +} + +static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes; + + /* Calculate the number of nodes in the supplied affinity mask */ + for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + if (cpumask_intersects(mask, cpumask_of_node(n))) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @nvec: The number of vectors + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, + int nvec) +{ + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + nodemask_t nodemsk = NODE_MASK_NONE; + struct cpumask *masks; + cpumask_var_t nmsk; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + if (!masks) + goto out; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + nodes = get_nodes_in_cpumask(affinity, &nodemsk); + + /* + * If the number of nodes in the mask is greater than or equal to the + * number of vectors we just spread the vectors across the nodes. 
+ */ + if (nvec <= nodes) { + for_each_node_mask(n, nodemsk) { + cpumask_copy(masks + curvec, cpumask_of_node(n)); + if (++curvec == nvec) + break; + } + goto outonl; + } + + /* Spread the vectors per node */ + vecs_per_node = nvec / nodes; + /* Account for rounding errors */ + extra_vecs = nvec - (nodes * vecs_per_node); + + for_each_node_mask(n, nodemsk) { + int ncpus, v, vecs_to_assign = vecs_per_node; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, affinity, cpumask_of_node(n)); + + /* Calculate the number of cpus per vector */ + ncpus = cpumask_weight(nmsk); + + for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + cpus_per_vec = ncpus / vecs_to_assign; + + /* Account for extra vectors to compensate rounding errors */ + if (extra_vecs) { + cpus_per_vec++; + if (!--extra_vecs) + vecs_per_node++; + } + irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + } + + if (curvec >= nvec) + break; + } + +outonl: + put_online_cpus(); +out: + free_cpumask_var(nmsk); + return masks; +} + +/** + * irq_calc_affinity_vectors - Calculate the optimal number of vectors for a given affinity mask + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @maxvec: The maximum number of vectors available + */ +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + int cpus, ret; + + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + cpus = cpumask_weight(affinity); + ret = (cpus < maxvec) ? cpus : maxvec; + + put_online_cpus(); + return ret; +} + static int get_first_sibling(unsigned int cpu) { unsigned int ret; -- cgit v1.2.3-71-gd317 From 44082fd6702fb12020967fd375f8bf6dd7c111bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:50 +0200 Subject: genirq/affinity: Remove old irq spread infrastructure No more users. 
Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-5-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 7 ------ kernel/irq/affinity.c | 58 ----------------------------------------------- 2 files changed, 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 4e59d122cad9..72f0721f75e7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -278,7 +278,6 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); @@ -313,12 +312,6 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) return 0; } -static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) -{ - *nr_vecs = 1; - return NULL; -} - static inline struct cpumask * irq_create_affinity_masks(const struct cpumask *affinity, int nvec) { diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 7812fecc6e2f..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -152,61 +152,3 @@ int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) put_online_cpus(); return ret; } - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - return cpu; -} - -/* - * Take a map of online CPUs and the number of available interrupt vectors - * and generate an output cpumask suitable for spreading MSI/MSI-X vectors - * so that they 
are distributed as good as possible around the CPUs. If - * more vectors than CPUs are available we'll map one to each CPU, - * otherwise we map one to the first sibling of each socket. - * - * If there are more vectors than CPUs we will still only have one bit - * set per CPU, but interrupt code will keep on assigning the vectors from - * the start of the bitmap until we run out of vectors. - */ -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) -{ - struct cpumask *affinity_mask; - unsigned int max_vecs = *nr_vecs; - - if (max_vecs == 1) - return NULL; - - affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!affinity_mask) { - *nr_vecs = 1; - return NULL; - } - - get_online_cpus(); - if (max_vecs >= num_online_cpus()) { - cpumask_copy(affinity_mask, cpu_online_mask); - *nr_vecs = num_online_cpus(); - } else { - unsigned int vecs = 0, cpu; - - for_each_online_cpu(cpu) { - if (cpu == get_first_sibling(cpu)) { - cpumask_set_cpu(cpu, affinity_mask); - vecs++; - } - - if (--max_vecs == 0) - break; - } - *nr_vecs = vecs; - } - put_online_cpus(); - - return affinity_mask; -} -- cgit v1.2.3-71-gd317 From ee8d41e53efe14bfc5ea5866e1178b06d78a7c95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2016 16:18:51 +0200 Subject: pci/msi: Retrieve affinity for a vector Add a helper to get the affinity mask for a given PCI irq vector. For MSI or MSI-X vectors these are stored by the IRQ core, while for legacy interrupts we will always return cpu_possible_map. 
[hch: updated to follow the style of pci_irq_vector()] Signed-off-by: Thomas Gleixner Signed-off-by: Christoph Hellwig Cc: axboe@fb.com Cc: keith.busch@intel.com Cc: agordeev@redhat.com Cc: linux-block@vger.kernel.org Link: http://lkml.kernel.org/r/1473862739-15032-6-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 06100dde0e86..9da5ecb41f0b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1270,6 +1270,37 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr) } EXPORT_SYMBOL(pci_irq_vector); +/** + * pci_irq_get_affinity - return the affinity of a particular msi vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + */ +const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->affinity; + i++; + } + WARN_ON_ONCE(1); + return NULL; + } else if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(!entry || nr >= entry->nvec_used)) + return NULL; + + return &entry->affinity[nr]; + } else { + return cpu_possible_mask; + } +} +EXPORT_SYMBOL(pci_irq_get_affinity); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 0ab835965669..3b0a8004f313 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1300,6 +1300,7 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags); void pci_free_irq_vectors(struct pci_dev *dev); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); +const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, 
int vec); #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } @@ -1342,6 +1343,11 @@ static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) return -EINVAL; return dev->irq; } +static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, + int vec) +{ + return cpu_possible_mask; +} #endif #ifdef CONFIG_PCIEPORTBUS -- cgit v1.2.3-71-gd317