From fe364a7d95c24e07e9b3f2ab917f01d6d8330bba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Jul 2021 14:39:40 +0300 Subject: dmaengine: dw: Program xBAR hardware for Elkhart Lake Intel Elkhart Lake PSE DMA implementation is integrated with crossbar IP in order to serve more hardware than there are DMA request lines available. Due to this, program xBAR hardware to make flexible support of PSE peripheral. The Device-to-Device has not been tested and it's not supported by DMA Engine, but it's left in the code for the sake of documenting hardware features. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210712113940.42753-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b34a094b2258..b11b0c8bc5da 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -52,6 +52,7 @@ struct dw_dma_slave { * @max_burst: Maximum value of burst transaction size supported by hardware * per channel (in units of CTL.SRC_TR_WIDTH/CTL.DST_TR_WIDTH). * @protctl: Protection control signals setting per channel. + * @quirks: Optional platform quirks. */ struct dw_dma_platform_data { unsigned int nr_channels; @@ -71,6 +72,8 @@ struct dw_dma_platform_data { #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) unsigned char protctl; +#define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) + unsigned int quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3-71-gd317 From 7ed012969bbcdbd7aef5778a061681e6cbc4b402 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 14 Jul 2021 17:01:59 +0200 Subject: Compiler Attributes: fix __has_attribute(__no_sanitize_coverage__) for GCC 4 Fix __has_attribute(__no_sanitize_coverage__) for GCC 4 by defining __GCC4_has_attribute___no_sanitize_coverage__. 
Fixes: 540540d06e9d ("kcov: add __no_sanitize_coverage to fix noinstr for all architectures") Reported-by: Geert Uytterhoeven Signed-off-by: Marco Elver Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 183ddd5fd072..7b1fa5c30169 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -36,6 +36,7 @@ # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___no_sanitize_coverage__ 0 # define __GCC4_has_attribute___fallthrough__ 0 #endif -- cgit v1.2.3-71-gd317 From d08c8b855140e9f5240b3ffd1b8b9d435675e281 Mon Sep 17 00:00:00 2001 From: Wasim Khan Date: Thu, 29 Jul 2021 14:17:47 +0200 Subject: PCI: Add ACS quirks for NXP LX2xx0 and LX2xx2 platforms Root Ports in NXP LX2xx0 and LX2xx2, where each Root Port is a Root Complex with unique segment numbers, do provide isolation features to disable peer transactions and validate bus numbers in requests, but do not provide an actual PCIe ACS capability. Add ACS quirks for NXP LX2xx0 A/C/E/N and LX2xx2 A/C/E/N platforms. 
LX2xx0A : without security features + CAN-FD LX2160A (0x8d81) - 16 cores LX2120A (0x8da1) - 12 cores LX2080A (0x8d83) - 8 cores LX2xx0C : security features + CAN-FD LX2160C (0x8d80) - 16 cores LX2120C (0x8da0) - 12 cores LX2080C (0x8d82) - 8 cores LX2xx0E : security features + CAN LX2160E (0x8d90) - 16 cores LX2120E (0x8db0) - 12 cores LX2080E (0x8d92) - 8 cores LX2xx0N : without security features + CAN LX2160N (0x8d91) - 16 cores LX2120N (0x8db1) - 12 cores LX2080N (0x8d93) - 8 cores LX2xx2A : without security features + CAN-FD LX2162A (0x8d89) - 16 cores LX2122A (0x8da9) - 12 cores LX2082A (0x8d8b) - 8 cores LX2xx2C : security features + CAN-FD LX2162C (0x8d88) - 16 cores LX2122C (0x8da8) - 12 cores LX2082C (0x8d8a) - 8 cores LX2xx2E : security features + CAN LX2162E (0x8d98) - 16 cores LX2122E (0x8db8) - 12 cores LX2082E (0x8d9a) - 8 cores LX2xx2N : without security features + CAN LX2162N (0x8d99) - 16 cores LX2122N (0x8db9) - 12 cores LX2082N (0x8d9b) - 8 cores [bhelgaas: put PCI_VENDOR_ID_NXP definition next to PCI_VENDOR_ID_FREESCALE as a clue that they share the same Device ID namespace] Link: https://lore.kernel.org/r/20210729121747.1823086-1-wasim.khan@oss.nxp.com Link: https://lore.kernel.org/r/20210803180021.3252886-1-wasim.khan@oss.nxp.com Signed-off-by: Wasim Khan Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 3 ++- 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6d74386eadc2..207d089a8d37 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4614,6 +4614,18 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags) PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); } +/* + * Each of these NXP Root Ports is in a Root Complex with a unique segment + * number and does provide isolation features to disable peer transactions + * and validate bus numbers in 
requests, but does not provide an ACS + * capability. + */ +static int pci_quirk_nxp_rp_acs(struct pci_dev *dev, u16 acs_flags) +{ + return pci_acs_ctrl_enabled(acs_flags, + PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); +} + static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags) { if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) @@ -4860,6 +4872,39 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, 0x3038, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_ZHAOXIN, 0x3104, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* NXP root ports, xx=16, 12, or 08 cores */ + /* LX2xx0A : without security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d81, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da1, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d83, pci_quirk_nxp_rp_acs }, + /* LX2xx0C : security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d80, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da0, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d82, pci_quirk_nxp_rp_acs }, + /* LX2xx0E : security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d90, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db0, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d92, pci_quirk_nxp_rp_acs }, + /* LX2xx0N : without security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d91, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db1, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d93, pci_quirk_nxp_rp_acs }, + /* LX2xx2A : without security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d89, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da9, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d8b, pci_quirk_nxp_rp_acs }, + /* LX2xx2C : security features + CAN-FD */ + { PCI_VENDOR_ID_NXP, 0x8d88, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8da8, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d8a, pci_quirk_nxp_rp_acs }, + /* LX2xx2E : security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d98, 
pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db8, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d9a, pci_quirk_nxp_rp_acs }, + /* LX2xx2N : without security features + CAN */ + { PCI_VENDOR_ID_NXP, 0x8d99, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8db9, pci_quirk_nxp_rp_acs }, + { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, { 0 } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..1a9b8589391c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2451,7 +2451,8 @@ #define PCI_VENDOR_ID_TDI 0x192E #define PCI_DEVICE_ID_TDI_EHCI 0x0101 -#define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_VENDOR_ID_FREESCALE 0x1957 /* duplicate: NXP */ +#define PCI_VENDOR_ID_NXP 0x1957 /* duplicate: FREESCALE */ #define PCI_DEVICE_ID_MPC8308 0xc006 #define PCI_DEVICE_ID_MPC8315E 0x00b4 #define PCI_DEVICE_ID_MPC8315 0x00b5 -- cgit v1.2.3-71-gd317 From 08bf54fcf5ca87328541e035090c6a85c8e064f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Aug 2021 21:43:54 +0300 Subject: dmaengine: dw: Convert members to u32 in platform data u32 is a type that is used for properties retrieval from DT. With the type change it allows to clean up properties reading routine. While at it, order the fields in way how they are parsed. 
Signed-off-by: Andy Shevchenko Reviewed-by: Serge Semin Tested-by: Serge Semin Link: https://lore.kernel.org/r/20210802184355.49879-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b11b0c8bc5da..860ba4bc5ead 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -41,11 +41,11 @@ struct dw_dma_slave { /** * struct dw_dma_platform_data - Controller configuration parameters + * @nr_masters: Number of AHB masters supported by the controller * @nr_channels: Number of channels supported by hardware (max 8) * @chan_allocation_order: Allocate channels starting from 0 or 7 * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0. * @block_size: Maximum block size supported by the controller - * @nr_masters: Number of AHB masters supported by the controller * @data_width: Maximum data width supported by hardware per AHB master * (in bytes, power of 2) * @multi_block: Multi block transfers supported by hardware per channel. @@ -55,25 +55,25 @@ struct dw_dma_slave { * @quirks: Optional platform quirks. 
*/ struct dw_dma_platform_data { - unsigned int nr_channels; + u32 nr_masters; + u32 nr_channels; #define CHAN_ALLOCATION_ASCENDING 0 /* zero to seven */ #define CHAN_ALLOCATION_DESCENDING 1 /* seven to zero */ - unsigned char chan_allocation_order; + u32 chan_allocation_order; #define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ #define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ - unsigned char chan_priority; - unsigned int block_size; - unsigned char nr_masters; - unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; - unsigned char multi_block[DW_DMA_MAX_NR_CHANNELS]; + u32 chan_priority; + u32 block_size; + u32 data_width[DW_DMA_MAX_NR_MASTERS]; + u32 multi_block[DW_DMA_MAX_NR_CHANNELS]; u32 max_burst[DW_DMA_MAX_NR_CHANNELS]; #define CHAN_PROTCTL_PRIVILEGED BIT(0) #define CHAN_PROTCTL_BUFFERABLE BIT(1) #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) - unsigned char protctl; + u32 protctl; #define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) - unsigned int quirks; + u32 quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3-71-gd317 From c17495b01b72b53bd290f442d39b060e015c7aea Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Aug 2021 12:04:33 +0530 Subject: cpufreq: Add callback to register with energy model Many cpufreq drivers register with the energy model for each policy and do exactly the same thing. Follow the footsteps of thermal-cooling, to get it done from the cpufreq core itself. Provide a new callback, which will be called, if present, by the cpufreq core at the right moment (more on that in the code's comment). Also provide a generic implementation that uses dev_pm_opp_of_register_em(). This also allows us to register with the EM at a later point of time, compared to ->init(), from where the EM core can access cpufreq policy directly using cpufreq_cpu_get() type of helpers and perform other work, like marking few frequencies inefficient, this will be done separately. 
Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 13 +++++++++++++ include/linux/cpufreq.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 45f3416988f1..d301f39248a0 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1491,6 +1491,19 @@ static int cpufreq_online(unsigned int cpu) write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); + + /* + * Register with the energy model before + * sched_cpufreq_governor_change() is called, which will result + * in rebuilding of the sched domains, which should only be done + * once the energy model is properly initialized for the policy + * first. + * + * Also, this should be called before the policy is registered + * with cooling framework. + */ + if (cpufreq_driver->register_em) + cpufreq_driver->register_em(policy); } ret = cpufreq_init_policy(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9fd719475fcd..c65a1d7385f8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -373,6 +375,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. 
+ */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1046,4 +1054,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ -- cgit v1.2.3-71-gd317 From fd00faa375fbb9d46ae0730d0faf4a3006301005 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 8 Aug 2021 19:21:56 +0200 Subject: PCI/VPD: Embed struct pci_vpd in struct pci_dev Now that struct pci_vpd is really small, simplify the code by embedding struct pci_vpd directly in struct pci_dev instead of dynamically allocating it. Link: https://lore.kernel.org/r/d898489e-22ba-71f1-2f31-f1a78dc15849@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/probe.c | 1 - drivers/pci/vpd.c | 63 ++++++++++++----------------------------------------- include/linux/pci.h | 9 ++++++-- 3 files changed, 21 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 79177ac37880..0ec5c792c27d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2225,7 +2225,6 @@ static void pci_release_capabilities(struct pci_dev *dev) { pci_aer_exit(dev); pci_rcec_exit(dev); - pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); } diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 9d9cff5f89e2..ee48e167145f 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -13,12 +13,6 @@ /* VPD access through PCI 2.2+ VPD capability */ -struct pci_vpd { - struct mutex lock; - unsigned int len; - u8 cap; -}; - static struct pci_dev *pci_get_func0_dev(struct pci_dev *dev) { return pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); @@ -37,7 +31,7 @@ static size_t 
pci_vpd_size(struct pci_dev *dev) unsigned char tag, header[1+2]; /* 1 byte tag, 2 bytes length */ /* Otherwise the following reads would fail. */ - dev->vpd->len = PCI_VPD_MAX_SIZE; + dev->vpd.len = PCI_VPD_MAX_SIZE; while (pci_read_vpd(dev, off, 1, header) == 1) { size = 0; @@ -89,7 +83,7 @@ error: */ static int pci_vpd_wait(struct pci_dev *dev, bool set) { - struct pci_vpd *vpd = dev->vpd; + struct pci_vpd *vpd = &dev->vpd; unsigned long timeout = jiffies + msecs_to_jiffies(125); unsigned long max_sleep = 16; u16 status; @@ -119,12 +113,12 @@ static int pci_vpd_wait(struct pci_dev *dev, bool set) static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, void *arg) { - struct pci_vpd *vpd = dev->vpd; + struct pci_vpd *vpd = &dev->vpd; int ret = 0; loff_t end = pos + count; u8 *buf = arg; - if (!vpd) + if (!vpd->cap) return -ENODEV; if (pos < 0) @@ -186,12 +180,12 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, const void *arg) { - struct pci_vpd *vpd = dev->vpd; + struct pci_vpd *vpd = &dev->vpd; const u8 *buf = arg; loff_t end = pos + count; int ret = 0; - if (!vpd) + if (!vpd->cap) return -ENODEV; if (pos < 0 || (pos & 3) || (count & 3)) @@ -238,25 +232,8 @@ static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, void pci_vpd_init(struct pci_dev *dev) { - struct pci_vpd *vpd; - u8 cap; - - cap = pci_find_capability(dev, PCI_CAP_ID_VPD); - if (!cap) - return; - - vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC); - if (!vpd) - return; - - mutex_init(&vpd->lock); - vpd->cap = cap; - dev->vpd = vpd; -} - -void pci_vpd_release(struct pci_dev *dev) -{ - kfree(dev->vpd); + dev->vpd.cap = pci_find_capability(dev, PCI_CAP_ID_VPD); + mutex_init(&dev->vpd.lock); } static ssize_t vpd_read(struct file *filp, struct kobject *kobj, @@ -288,7 +265,7 @@ static umode_t vpd_attr_is_visible(struct kobject *kobj, { struct pci_dev *pdev = 
to_pci_dev(kobj_to_dev(kobj)); - if (!pdev->vpd) + if (!pdev->vpd.cap) return 0; return a->attr.mode; @@ -400,7 +377,7 @@ static void quirk_f0_vpd_link(struct pci_dev *dev) if (!f0) return; - if (f0->vpd && dev->class == f0->class && + if (f0->vpd.cap && dev->class == f0->class && dev->vendor == f0->vendor && dev->device == f0->device) dev->dev_flags |= PCI_DEV_FLAGS_VPD_REF_F0; @@ -418,10 +395,8 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, */ static void quirk_blacklist_vpd(struct pci_dev *dev) { - if (dev->vpd) { - dev->vpd->len = PCI_VPD_SZ_INVALID; - pci_warn(dev, FW_BUG "disabling VPD access (can't determine size of non-standard VPD format)\n"); - } + dev->vpd.len = PCI_VPD_SZ_INVALID; + pci_warn(dev, FW_BUG "disabling VPD access (can't determine size of non-standard VPD format)\n"); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x0060, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x007c, quirk_blacklist_vpd); @@ -443,16 +418,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID, DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); -static void pci_vpd_set_size(struct pci_dev *dev, size_t len) -{ - struct pci_vpd *vpd = dev->vpd; - - if (!vpd || len == 0 || len > PCI_VPD_MAX_SIZE) - return; - - vpd->len = len; -} - static void quirk_chelsio_extend_vpd(struct pci_dev *dev) { int chip = (dev->device & 0xf000) >> 12; @@ -471,9 +436,9 @@ static void quirk_chelsio_extend_vpd(struct pci_dev *dev) * limits. 
*/ if (chip == 0x0 && prod >= 0x20) - pci_vpd_set_size(dev, 8192); + dev->vpd.len = 8192; else if (chip >= 0x4 && func < 0x8) - pci_vpd_set_size(dev, 2048); + dev->vpd.len = 2048; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..e752cc39a1fe 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -300,9 +300,14 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +struct pci_vpd { + struct mutex lock; + unsigned int len; + u8 cap; +}; + struct irq_affinity; struct pcie_link_state; -struct pci_vpd; struct pci_sriov; struct pci_p2pdma; struct rcec_ea; @@ -473,7 +478,7 @@ struct pci_dev { #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; #endif - struct pci_vpd *vpd; + struct pci_vpd vpd; #ifdef CONFIG_PCIE_DPC u16 dpc_cap; unsigned int dpc_rp_extensions:1; -- cgit v1.2.3-71-gd317 From 7bb698f09bdd01fbb6d48c14bb1dde556dc1af00 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 28 Jul 2021 07:47:33 -0500 Subject: fs: Move notify_change permission checks into may_setattr Move the permission checks in notify_change into a separate function to make them available to filesystems. When notify_change is called, the vfs performs those checks before calling into iop->setattr. However, a filesystem like gfs2 can only lock and revalidate the inode inside ->setattr, and it must then repeat those checks to err on the safe side. It would be nice to get rid of the double checking, but moving the permission check into iop->setattr altogether isn't really an option. 
Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Al Viro --- fs/attr.c | 50 +++++++++++++++++++++++++++++++------------------- include/linux/fs.h | 2 ++ 2 files changed, 33 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/attr.c b/fs/attr.c index 87ef39db1c34..473d21b3a86d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -249,6 +249,34 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, } EXPORT_SYMBOL(setattr_copy); +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid) +{ + int error; + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(mnt_userns, inode)) { + error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (error) + return error; + } + } + return 0; +} +EXPORT_SYMBOL(may_setattr); + /** * notify_change - modify attributes of a filesytem object * @mnt_userns: user namespace of the mount the inode was found from @@ -290,25 +318,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - } - - /* - * If utimes(2) and friends are called with times == NULL (or both - * times are UTIME_NOW), then we need to check for write permission - */ - if (ia_valid & ATTR_TOUCH) { - if (IS_IMMUTABLE(inode)) - return -EPERM; - - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); - if (error) - return error; - } - } + error = may_setattr(mnt_userns, inode, ia_valid); 
+ if (error) + return error; if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..50192964bf6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3469,6 +3469,8 @@ extern int buffer_migrate_page_norefs(struct address_space *, #define buffer_migrate_page_norefs NULL #endif +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid); int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct user_namespace *, struct inode *inode, -- cgit v1.2.3-71-gd317 From fb83610762dd5927212aa62a468dd3b756b57a88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 11:06:44 +0200 Subject: thermal/core: Fix thermal_cooling_device_register() prototype There are two pairs of declarations for thermal_cooling_device_register() and thermal_of_cooling_device_register(), and only one set was changed in a recent patch, so the other one now causes a compile-time warning: drivers/net/wireless/mediatek/mt76/mt7915/init.c: In function 'mt7915_thermal_init': drivers/net/wireless/mediatek/mt76/mt7915/init.c:134:48: error: passing argument 1 of 'thermal_cooling_device_register' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 134 | cdev = thermal_cooling_device_register(wiphy_name(wiphy), phy, | ^~~~~~~~~~~~~~~~~ In file included from drivers/net/wireless/mediatek/mt76/mt7915/init.c:7: include/linux/thermal.h:407:39: note: expected 'char *' but argument is of type 'const char *' 407 | thermal_cooling_device_register(char *type, void *devdata, | ~~~~~~^~~~ Change the dummy helper functions to have the same arguments as the normal version. 
Fixes: f991de53a8ab ("thermal: make device_register's type argument const") Signed-off-by: Arnd Bergmann Reviewed-by: Jean-Francois Dagenais Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210722090717.1116748-1-arnd@kernel.org --- include/linux/thermal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d296f3b88fb9..8050d929a5b4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -404,12 +404,13 @@ static inline void thermal_zone_device_unregister( struct thermal_zone_device *tz) { } static inline struct thermal_cooling_device * -thermal_cooling_device_register(char *type, void *devdata, +thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, - char *type, void *devdata, const struct thermal_cooling_device_ops *ops) + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, -- cgit v1.2.3-71-gd317 From 454f2ed4b34f9ef5726d080b1eb5dc47a7f36d6f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 13:27:01 +0200 Subject: thermal: Spelling s/scallbacks/callbacks/ Fix a misspelling of the word "callbacks". 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/ae38372996a23bb67769e2d62ca170ae9457c4df.1626261946.git.geert+renesas@glider.be --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 8050d929a5b4..c314893970b3 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -285,7 +285,7 @@ struct thermal_zone_params { }; /** - * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones + * struct thermal_zone_of_device_ops - callbacks for handling DT based zones * * Mandatory: * @get_temp: a pointer to a function that reads the sensor temperature. -- cgit v1.2.3-71-gd317 From 69139244806537f9d51364f37fe146bb2ee88a05 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:52 +0530 Subject: PCI: Cache PCIe Device Capabilities register Add a new member called devcap in struct pci_dev for caching the PCIe Device Capabilities register to avoid reading PCI_EXP_DEVCAP multiple times. Refactor pcie_has_flr() to use cached device capabilities. 
Link: https://lore.kernel.org/r/20210817180500.1253-2-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- drivers/pci/pci.c | 6 ++---- drivers/pci/probe.c | 5 +++-- include/linux/pci.h | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index aacf575c15cf..b7a9f680c513 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -31,6 +31,7 @@ #include #include #include +#include <linux/bitfield.h> #include "pci.h" DEFINE_MUTEX(pci_slot_mutex); @@ -4630,13 +4631,10 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction); */ bool pcie_has_flr(struct pci_dev *dev) { - u32 cap; - if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) return false; - pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap); - return cap & PCI_EXP_DEVCAP_FLR; + return FIELD_GET(PCI_EXP_DEVCAP_FLR, dev->devcap) == 1; } EXPORT_SYMBOL_GPL(pcie_has_flr); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 79177ac37880..81eb88ae4301 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/bitfield.h> #include "pci.h" #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ @@ -1498,8 +1499,8 @@ void set_pcie_port_type(struct pci_dev *pdev) pdev->pcie_cap = pos; pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16); pdev->pcie_flags_reg = reg16; - pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16); - pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; + pci_read_config_dword(pdev, pos + PCI_EXP_DEVCAP, &pdev->devcap); + pdev->pcie_mpss = FIELD_GET(PCI_EXP_DEVCAP_PAYLOAD, pdev->devcap); parent = pci_upstream_bridge(pdev); if (!parent) diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..1179c0ee2bfb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -333,6 +333,7 @@ struct pci_dev { struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ struct pci_dev *rcec; /* Associated RCEC device 
*/ #endif + u32 devcap; /* PCIe Device Capabilities */ u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ u8 msix_cap; /* MSI-X capability offset */ -- cgit v1.2.3-71-gd317 From 56f107d7813f116484019617043393a7753ffcbf Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:53 +0530 Subject: PCI: Add pcie_reset_flr() with 'probe' argument Most reset methods are of the form "pci_*_reset(dev, probe)". pcie_flr() was an exception because it relied on a separate pcie_has_flr() function instead of taking a "probe" argument. Add "pcie_reset_flr(dev, probe)" to follow the convention. Remove pcie_has_flr(). Some pcie_flr() callers that did not use pcie_has_flr() remain. [bhelgaas: commit log, rework pcie_reset_flr() to use dev->devcap directly] Link: https://lore.kernel.org/r/20210817180500.1253-3-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- drivers/crypto/cavium/nitrox/nitrox_main.c | 4 +-- drivers/pci/pci.c | 56 ++++++++++++++++-------------- drivers/pci/pcie/aer.c | 12 +++---- drivers/pci/quirks.c | 9 ++--- include/linux/pci.h | 2 +- 5 files changed, 40 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/cavium/nitrox/nitrox_main.c b/drivers/crypto/cavium/nitrox/nitrox_main.c index 96bc7b5c6532..2db3fd5815c8 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_main.c +++ b/drivers/crypto/cavium/nitrox/nitrox_main.c @@ -306,9 +306,7 @@ static int nitrox_device_flr(struct pci_dev *pdev) return -ENOMEM; } - /* check flr support */ - if (pcie_has_flr(pdev)) - pcie_flr(pdev); + pcie_reset_flr(pdev, 0); pci_restore_state(pdev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b7a9f680c513..b0a63bdf8207 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4622,29 +4622,12 @@ int pci_wait_for_pending_transaction(struct pci_dev *dev) } EXPORT_SYMBOL(pci_wait_for_pending_transaction); -/** - * pcie_has_flr - check if 
a device supports function level resets - * @dev: device to check - * - * Returns true if the device advertises support for PCIe function level - * resets. - */ -bool pcie_has_flr(struct pci_dev *dev) -{ - if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) - return false; - - return FIELD_GET(PCI_EXP_DEVCAP_FLR, dev->devcap) == 1; -} -EXPORT_SYMBOL_GPL(pcie_has_flr); - /** * pcie_flr - initiate a PCIe function level reset * @dev: device to reset * - * Initiate a function level reset on @dev. The caller should ensure the - * device supports FLR before calling this function, e.g. by using the - * pcie_has_flr() helper. + * Initiate a function level reset unconditionally on @dev without + * checking any flags and DEVCAP */ int pcie_flr(struct pci_dev *dev) { @@ -4667,6 +4650,28 @@ int pcie_flr(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pcie_flr); +/** + * pcie_reset_flr - initiate a PCIe function level reset + * @dev: device to reset + * @probe: If set, only check if the device can be reset this way. + * + * Initiate a function level reset on @dev. 
+ */ +int pcie_reset_flr(struct pci_dev *dev, int probe) +{ + if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) + return -ENOTTY; + + if (!(dev->devcap & PCI_EXP_DEVCAP_FLR)) + return -ENOTTY; + + if (probe) + return 0; + + return pcie_flr(dev); +} +EXPORT_SYMBOL_GPL(pcie_reset_flr); + static int pci_af_flr(struct pci_dev *dev, int probe) { int pos; @@ -5149,11 +5154,9 @@ int __pci_reset_function_locked(struct pci_dev *dev) rc = pci_dev_specific_reset(dev, 0); if (rc != -ENOTTY) return rc; - if (pcie_has_flr(dev)) { - rc = pcie_flr(dev); - if (rc != -ENOTTY) - return rc; - } + rc = pcie_reset_flr(dev, 0); + if (rc != -ENOTTY) + return rc; rc = pci_af_flr(dev, 0); if (rc != -ENOTTY) return rc; @@ -5184,8 +5187,9 @@ int pci_probe_reset_function(struct pci_dev *dev) rc = pci_dev_specific_reset(dev, 1); if (rc != -ENOTTY) return rc; - if (pcie_has_flr(dev)) - return 0; + rc = pcie_reset_flr(dev, 1); + if (rc != -ENOTTY) + return rc; rc = pci_af_flr(dev, 1); if (rc != -ENOTTY) return rc; diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index df4ba9b384c2..031379deb130 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1407,13 +1407,11 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) } if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) { - if (pcie_has_flr(dev)) { - rc = pcie_flr(dev); - pci_info(dev, "has been reset (%d)\n", rc); - } else { - pci_info(dev, "not reset (no FLR support)\n"); - rc = -ENOTTY; - } + rc = pcie_reset_flr(dev, 0); + if (!rc) + pci_info(dev, "has been reset\n"); + else + pci_info(dev, "not reset (no FLR support: %d)\n", rc); } else { rc = pci_bus_error_reset(dev); pci_info(dev, "%s Port link has been reset (%d)\n", diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6d74386eadc2..a8a167bbc1d7 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3852,7 +3852,7 @@ static int nvme_disable_and_flr(struct pci_dev *dev, int probe) u32 cfg; if (dev->class != 
PCI_CLASS_STORAGE_EXPRESS || - !pcie_has_flr(dev) || !pci_resource_start(dev, 0)) + pcie_reset_flr(dev, 1) || !pci_resource_start(dev, 0)) return -ENOTTY; if (probe) @@ -3921,13 +3921,10 @@ static int nvme_disable_and_flr(struct pci_dev *dev, int probe) */ static int delay_250ms_after_flr(struct pci_dev *dev, int probe) { - if (!pcie_has_flr(dev)) - return -ENOTTY; - if (probe) - return 0; + return pcie_reset_flr(dev, 1); - pcie_flr(dev); + pcie_reset_flr(dev, 0); msleep(250); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1179c0ee2bfb..1de37e3fc29d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1229,7 +1229,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -bool pcie_has_flr(struct pci_dev *dev); +int pcie_reset_flr(struct pci_dev *dev, int probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -- cgit v1.2.3-71-gd317 From e20afa06244eb5d7fa850f9fe2a78ae17ba96f81 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:54 +0530 Subject: PCI: Add array to track reset method ordering Add reset_methods[] in struct pci_dev to keep track of reset mechanisms supported by the device and their ordering. Refactor probing and reset functions to take advantage of calling convention of reset functions. 
Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-4-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- drivers/pci/pci.c | 94 ++++++++++++++++++++++++++++++----------------------- drivers/pci/pci.h | 8 ++++- drivers/pci/probe.c | 5 ++- include/linux/pci.h | 6 ++++ 4 files changed, 69 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0a63bdf8207..43a823f8dd69 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -73,6 +73,11 @@ static void pci_dev_d3_sleep(struct pci_dev *dev) msleep(delay); } +bool pci_reset_supported(struct pci_dev *dev) +{ + return dev->reset_methods[0] != 0; +} + #ifdef CONFIG_PCI_DOMAINS int pci_domains_supported = 1; #endif @@ -5117,6 +5122,16 @@ static void pci_dev_restore(struct pci_dev *dev) err_handler->reset_done(dev); } +/* dev->reset_methods[] is a 0-terminated list of indices into this array */ +static const struct pci_reset_fn_method pci_reset_fn_methods[] = { + { }, + { pci_dev_specific_reset, .name = "device_specific" }, + { pcie_reset_flr, .name = "flr" }, + { pci_af_flr, .name = "af_flr" }, + { pci_pm_reset, .name = "pm" }, + { pci_reset_bus_function, .name = "bus" }, +}; + /** * __pci_reset_function_locked - reset a PCI device function while holding * the @dev mutex lock. @@ -5139,65 +5154,64 @@ static void pci_dev_restore(struct pci_dev *dev) */ int __pci_reset_function_locked(struct pci_dev *dev) { - int rc; + int i, m, rc = -ENOTTY; might_sleep(); /* - * A reset method returns -ENOTTY if it doesn't support this device - * and we should try the next method. + * A reset method returns -ENOTTY if it doesn't support this device and + * we should try the next method. * - * If it returns 0 (success), we're finished. 
If it returns any - * other error, we're also finished: this indicates that further - * reset mechanisms might be broken on the device. + * If it returns 0 (success), we're finished. If it returns any other + * error, we're also finished: this indicates that further reset + * mechanisms might be broken on the device. */ - rc = pci_dev_specific_reset(dev, 0); - if (rc != -ENOTTY) - return rc; - rc = pcie_reset_flr(dev, 0); - if (rc != -ENOTTY) - return rc; - rc = pci_af_flr(dev, 0); - if (rc != -ENOTTY) - return rc; - rc = pci_pm_reset(dev, 0); - if (rc != -ENOTTY) - return rc; - return pci_reset_bus_function(dev, 0); + for (i = 0; i < PCI_NUM_RESET_METHODS; i++) { + m = dev->reset_methods[i]; + if (!m) + return -ENOTTY; + + rc = pci_reset_fn_methods[m].reset_fn(dev, 0); + if (!rc) + return 0; + if (rc != -ENOTTY) + return rc; + } + + return -ENOTTY; } EXPORT_SYMBOL_GPL(__pci_reset_function_locked); /** - * pci_probe_reset_function - check whether the device can be safely reset - * @dev: PCI device to reset + * pci_init_reset_methods - check whether device can be safely reset + * and store supported reset mechanisms. + * @dev: PCI device to check for reset mechanisms * * Some devices allow an individual function to be reset without affecting - * other functions in the same device. The PCI device must be responsive - * to PCI config space in order to use this function. + * other functions in the same device. The PCI device must be in D0-D3hot + * state. * - * Returns 0 if the device function can be reset or negative if the - * device doesn't support resetting a single function. + * Stores reset mechanisms supported by device in reset_methods byte array + * which is a member of struct pci_dev. 
*/ -int pci_probe_reset_function(struct pci_dev *dev) +void pci_init_reset_methods(struct pci_dev *dev) { - int rc; + int m, i, rc; + + BUILD_BUG_ON(ARRAY_SIZE(pci_reset_fn_methods) != PCI_NUM_RESET_METHODS); might_sleep(); - rc = pci_dev_specific_reset(dev, 1); - if (rc != -ENOTTY) - return rc; - rc = pcie_reset_flr(dev, 1); - if (rc != -ENOTTY) - return rc; - rc = pci_af_flr(dev, 1); - if (rc != -ENOTTY) - return rc; - rc = pci_pm_reset(dev, 1); - if (rc != -ENOTTY) - return rc; + i = 0; + for (m = 1; m < PCI_NUM_RESET_METHODS; m++) { + rc = pci_reset_fn_methods[m].reset_fn(dev, 1); + if (!rc) + dev->reset_methods[i++] = m; + else if (rc != -ENOTTY) + break; + } - return pci_reset_bus_function(dev, 1); + dev->reset_methods[i] = 0; } /** diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 93dcdd431072..ebeacb3dbe1e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -33,7 +33,8 @@ enum pci_mmap_api { int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai, enum pci_mmap_api mmap_api); -int pci_probe_reset_function(struct pci_dev *dev); +bool pci_reset_supported(struct pci_dev *dev); +void pci_init_reset_methods(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); @@ -610,6 +611,11 @@ struct pci_dev_reset_methods { int (*reset)(struct pci_dev *dev, int probe); }; +struct pci_reset_fn_method { + int (*reset_fn)(struct pci_dev *pdev, int probe); + char *name; +}; + #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_reset(struct pci_dev *dev, int probe); #else diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 81eb88ae4301..817ad149ebd1 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2429,9 +2429,8 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_rcec_init(dev); /* Root Complex Event Collector */ pcie_report_downtraining(dev); - - if (pci_probe_reset_function(dev) == 0) - dev->reset_fn = 1; + pci_init_reset_methods(dev); + 
dev->reset_fn = pci_reset_supported(dev); } /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 1de37e3fc29d..2faf517d20c1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -49,6 +49,9 @@ PCI_STATUS_SIG_TARGET_ABORT | \ PCI_STATUS_PARITY) +/* Number of reset methods used in pci_reset_fn_methods array in pci.c */ +#define PCI_NUM_RESET_METHODS 6 + /* * The PCI interface treats multi-function devices as independent * devices. The slot/function address of each device is encoded @@ -506,6 +509,9 @@ struct pci_dev { char *driver_override; /* Driver name to force a match */ unsigned long priv_flags; /* Private flags for the PCI driver */ + + /* These methods index pci_reset_fn_methods[] */ + u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */ }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) -- cgit v1.2.3-71-gd317 From 4ec36dfeb155b72da8d28ab006a46f2f8b981eac Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:55 +0530 Subject: PCI: Remove reset_fn field from pci_dev "reset_fn" indicates whether the device supports any reset mechanism. Remove the use of reset_fn in favor of the reset_methods array that tracks supported reset mechanisms of a device and their ordering. The octeon driver incorrectly used reset_fn to detect whether the device supports FLR or not. Use pcie_reset_flr() to probe whether it supports FLR. 
Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-5-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Reviewed-by: Raphael Norwitz --- drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 2 +- drivers/pci/pci-sysfs.c | 2 +- drivers/pci/pci.c | 6 +++--- drivers/pci/probe.c | 1 - drivers/pci/quirks.c | 2 +- drivers/pci/remove.c | 1 - include/linux/pci.h | 1 - 7 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index ffddb3126a32..d185df5acea6 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -526,7 +526,7 @@ static void octeon_destroy_resources(struct octeon_device *oct) oct->irq_name_storage = NULL; } /* Soft reset the octeon device before exiting */ - if (oct->pci_dev->reset_fn) + if (!pcie_reset_flr(oct->pci_dev, 1)) octeon_pci_flr(oct); else cn23xx_vf_ask_pf_to_do_flr(oct); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5d63df7c1820..a1d9b0e83615 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1367,7 +1367,7 @@ static umode_t pci_dev_reset_attr_is_visible(struct kobject *kobj, { struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); - if (!pdev->reset_fn) + if (!pci_reset_supported(pdev)) return 0; return a->mode; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 43a823f8dd69..5ead8826c702 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5234,7 +5234,7 @@ int pci_reset_function(struct pci_dev *dev) { int rc; - if (!dev->reset_fn) + if (!pci_reset_supported(dev)) return -ENOTTY; pci_dev_lock(dev); @@ -5270,7 +5270,7 @@ int pci_reset_function_locked(struct pci_dev *dev) { int rc; - if (!dev->reset_fn) + if (!pci_reset_supported(dev)) return -ENOTTY; 
pci_dev_save_and_disable(dev); @@ -5293,7 +5293,7 @@ int pci_try_reset_function(struct pci_dev *dev) { int rc; - if (!dev->reset_fn) + if (!pci_reset_supported(dev)) return -ENOTTY; if (!pci_dev_trylock(dev)) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 817ad149ebd1..3325d4682cd6 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2430,7 +2430,6 @@ static void pci_init_capabilities(struct pci_dev *dev) pcie_report_downtraining(dev); pci_init_reset_methods(dev); - dev->reset_fn = pci_reset_supported(dev); } /* diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index a8a167bbc1d7..a1b57b63c624 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5626,7 +5626,7 @@ static void quirk_reset_lenovo_thinkpad_p50_nvgpu(struct pci_dev *pdev) if (pdev->subsystem_vendor != PCI_VENDOR_ID_LENOVO || pdev->subsystem_device != 0x222e || - !pdev->reset_fn) + !pci_reset_supported(pdev)) return; if (pci_enable_device_mem(pdev)) diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index dd12c2fcc7dc..4c54c75050dc 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -19,7 +19,6 @@ static void pci_stop_dev(struct pci_dev *dev) pci_pme_active(dev, false); if (pci_dev_is_added(dev)) { - dev->reset_fn = 0; device_release_driver(&dev->dev); pci_proc_detach_device(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 2faf517d20c1..d1f4d248617b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -431,7 +431,6 @@ struct pci_dev { unsigned int state_saved:1; unsigned int is_physfn:1; unsigned int is_virtfn:1; - unsigned int reset_fn:1; unsigned int is_hotplug_bridge:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ -- cgit v1.2.3-71-gd317 From de3438c47a8ddc75548e62a03736a9321c2b7bac Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Mon, 9 Aug 2021 15:15:59 -0400 Subject: firmware: qcom_scm: Introduce SCM calls to access LMh Introduce SCM calls 
to access/configure limits management hardware(LMH). Reviewed-by: Bjorn Andersson Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210809191605.3742979-2-thara.gopinath@linaro.org --- drivers/firmware/qcom_scm.c | 58 +++++++++++++++++++++++++++++++++++++++++++++ drivers/firmware/qcom_scm.h | 4 ++++ include/linux/qcom_scm.h | 14 +++++++++++ 3 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index 47ea2bd42b10..8a503753fe2a 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -1147,6 +1147,64 @@ int qcom_scm_qsmmu500_wait_safe_toggle(bool en) } EXPORT_SYMBOL(qcom_scm_qsmmu500_wait_safe_toggle); +bool qcom_scm_lmh_dcvsh_available(void) +{ + return __qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_LMH, QCOM_SCM_LMH_LIMIT_DCVSH); +} +EXPORT_SYMBOL(qcom_scm_lmh_dcvsh_available); + +int qcom_scm_lmh_profile_change(u32 profile_id) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_LMH, + .cmd = QCOM_SCM_LMH_LIMIT_PROFILE_CHANGE, + .arginfo = QCOM_SCM_ARGS(1, QCOM_SCM_VAL), + .args[0] = profile_id, + .owner = ARM_SMCCC_OWNER_SIP, + }; + + return qcom_scm_call(__scm->dev, &desc, NULL); +} +EXPORT_SYMBOL(qcom_scm_lmh_profile_change); + +int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version) +{ + dma_addr_t payload_phys; + u32 *payload_buf; + int ret, payload_size = 5 * sizeof(u32); + + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_LMH, + .cmd = QCOM_SCM_LMH_LIMIT_DCVSH, + .arginfo = QCOM_SCM_ARGS(5, QCOM_SCM_RO, QCOM_SCM_VAL, QCOM_SCM_VAL, + QCOM_SCM_VAL, QCOM_SCM_VAL), + .args[1] = payload_size, + .args[2] = limit_node, + .args[3] = node_id, + .args[4] = version, + .owner = ARM_SMCCC_OWNER_SIP, + }; + + payload_buf = dma_alloc_coherent(__scm->dev, payload_size, &payload_phys, GFP_KERNEL); + if (!payload_buf) + return -ENOMEM; + + payload_buf[0] = 
payload_fn; + payload_buf[1] = 0; + payload_buf[2] = payload_reg; + payload_buf[3] = 1; + payload_buf[4] = payload_val; + + desc.args[0] = payload_phys; + + ret = qcom_scm_call(__scm->dev, &desc, NULL); + + dma_free_coherent(__scm->dev, payload_size, payload_buf, payload_phys); + return ret; +} +EXPORT_SYMBOL(qcom_scm_lmh_dcvsh); + static int qcom_scm_find_dload_address(struct device *dev, u64 *addr) { struct device_node *tcsr; diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h index 632fe3142462..d92156ceb3ac 100644 --- a/drivers/firmware/qcom_scm.h +++ b/drivers/firmware/qcom_scm.h @@ -114,6 +114,10 @@ extern int scm_legacy_call(struct device *dev, const struct qcom_scm_desc *desc, #define QCOM_SCM_SVC_HDCP 0x11 #define QCOM_SCM_HDCP_INVOKE 0x01 +#define QCOM_SCM_SVC_LMH 0x13 +#define QCOM_SCM_LMH_LIMIT_PROFILE_CHANGE 0x01 +#define QCOM_SCM_LMH_LIMIT_DCVSH 0x10 + #define QCOM_SCM_SVC_SMMU_PROGRAM 0x15 #define QCOM_SCM_SMMU_CONFIG_ERRATA1 0x03 #define QCOM_SCM_SMMU_CONFIG_ERRATA1_CLIENT_ALL 0x02 diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index 0165824c5128..c0475d1c9885 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -109,6 +109,12 @@ extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp); extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en); + +extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version); +extern int qcom_scm_lmh_profile_change(u32 profile_id); +extern bool qcom_scm_lmh_dcvsh_available(void); + #else #include @@ -170,5 +176,13 @@ static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) { return -ENODEV; } + +static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version) + { return -ENODEV; } + +static inline int 
qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } + +static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } #endif #endif -- cgit v1.2.3-71-gd317 From 6937b7dd434962377e00efc04adac0390c287199 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 17 Aug 2021 23:34:59 +0530 Subject: PCI: Add support for ACPI _RST reset method _RST is a standard ACPI method that performs a function level reset of a device (ACPI v6.3, sec 7.3.25). Add pci_dev_acpi_reset() to probe for _RST method and execute if present. The default priority of this reset is set to below device-specific and above hardware resets. Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-9-ameynarkhede03@gmail.com Signed-off-by: Shanker Donthineni Signed-off-by: Bjorn Helgaas Reviewed-by: Sinan Kaya Reviewed-by: Alex Williamson --- drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++ drivers/pci/pci.c | 1 + drivers/pci/pci.h | 6 ++++++ include/linux/pci.h | 2 +- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index c27dbb2294e3..b63db75a3dbf 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -941,6 +941,29 @@ void pci_set_acpi_fwnode(struct pci_dev *dev) acpi_pci_find_companion(&dev->dev)); } +/** + * pci_dev_acpi_reset - do a function level reset using _RST method + * @dev: device to reset + * @probe: check if _RST method is included in the acpi_device context. 
+ */ +int pci_dev_acpi_reset(struct pci_dev *dev, int probe) +{ + acpi_handle handle = ACPI_HANDLE(&dev->dev); + + if (!handle || !acpi_has_method(handle, "_RST")) + return -ENOTTY; + + if (probe) + return 0; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { + pci_warn(dev, "ACPI _RST failed\n"); + return -ENOTTY; + } + + return 0; +} + static bool acpi_pci_power_manageable(struct pci_dev *dev) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6da5f6d87f6a..4d9828160c48 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5126,6 +5126,7 @@ static void pci_dev_restore(struct pci_dev *dev) static const struct pci_reset_fn_method pci_reset_fn_methods[] = { { }, { pci_dev_specific_reset, .name = "device_specific" }, + { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, { pci_pm_reset, .name = "pm" }, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 232047e58b73..87cfd8db8827 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -708,7 +708,13 @@ static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); +int pci_dev_acpi_reset(struct pci_dev *dev, int probe); #else +static inline int pci_dev_acpi_reset(struct pci_dev *dev, int probe) +{ + return -ENOTTY; +} + static inline void pci_set_acpi_fwnode(struct pci_dev *dev) {} static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index d1f4d248617b..98718f46a61c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -50,7 +50,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 6 +#define PCI_NUM_RESET_METHODS 7 /* * The PCI interface 
treats multi-function devices as independent -- cgit v1.2.3-71-gd317 From 9bdc81ce440ec6ea899b236879aee470ec388020 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:35:00 +0530 Subject: PCI: Change the type of probe argument in reset functions Change the type of probe argument in functions which implement reset methods from int to bool to make the context and intent clear. Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-10-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas --- drivers/crypto/cavium/nitrox/nitrox_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 2 +- drivers/pci/hotplug/pciehp.h | 2 +- drivers/pci/hotplug/pciehp_hpc.c | 2 +- drivers/pci/hotplug/pnv_php.c | 2 +- drivers/pci/pci-acpi.c | 4 +- drivers/pci/pci.c | 44 +++++++++++----------- drivers/pci/pci.h | 12 +++--- drivers/pci/pcie/aer.c | 2 +- drivers/pci/quirks.c | 20 +++++----- include/linux/pci.h | 5 ++- include/linux/pci_hotplug.h | 2 +- 12 files changed, 51 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/cavium/nitrox/nitrox_main.c b/drivers/crypto/cavium/nitrox/nitrox_main.c index 2db3fd5815c8..6c61817996a3 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_main.c +++ b/drivers/crypto/cavium/nitrox/nitrox_main.c @@ -306,7 +306,7 @@ static int nitrox_device_flr(struct pci_dev *pdev) return -ENOMEM; } - pcie_reset_flr(pdev, 0); + pcie_reset_flr(pdev, PCI_RESET_DO_RESET); pci_restore_state(pdev); diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index d185df5acea6..ac821c5532a4 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -526,7 +526,7 @@ static void octeon_destroy_resources(struct octeon_device *oct) oct->irq_name_storage = NULL; } /* Soft reset the octeon device before exiting */ - if 
(!pcie_reset_flr(oct->pci_dev, 1)) + if (!pcie_reset_flr(oct->pci_dev, PCI_RESET_PROBE)) octeon_pci_flr(oct); else cn23xx_vf_ask_pf_to_do_flr(oct); diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index d4a930881054..69fd401691be 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -184,7 +184,7 @@ void pciehp_release_ctrl(struct controller *ctrl); int pciehp_sysfs_enable_slot(struct hotplug_slot *hotplug_slot); int pciehp_sysfs_disable_slot(struct hotplug_slot *hotplug_slot); -int pciehp_reset_slot(struct hotplug_slot *hotplug_slot, int probe); +int pciehp_reset_slot(struct hotplug_slot *hotplug_slot, bool probe); int pciehp_get_attention_status(struct hotplug_slot *hotplug_slot, u8 *status); int pciehp_set_raw_indicator_status(struct hotplug_slot *h_slot, u8 status); int pciehp_get_raw_indicator_status(struct hotplug_slot *h_slot, u8 *status); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 9d06939736c0..3024d7e85e6a 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -870,7 +870,7 @@ void pcie_disable_interrupt(struct controller *ctrl) * momentarily, if we see that they could interfere. Also, clear any spurious * events after. 
*/ -int pciehp_reset_slot(struct hotplug_slot *hotplug_slot, int probe) +int pciehp_reset_slot(struct hotplug_slot *hotplug_slot, bool probe) { struct controller *ctrl = to_ctrl(hotplug_slot); struct pci_dev *pdev = ctrl_dev(ctrl); diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 04565162a449..f4c2e6e01be0 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -526,7 +526,7 @@ scan: return 0; } -static int pnv_php_reset_slot(struct hotplug_slot *slot, int probe) +static int pnv_php_reset_slot(struct hotplug_slot *slot, bool probe) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); struct pci_dev *bridge = php_slot->pdev; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index b63db75a3dbf..fe286c861187 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -944,9 +944,9 @@ void pci_set_acpi_fwnode(struct pci_dev *dev) /** * pci_dev_acpi_reset - do a function level reset using _RST method * @dev: device to reset - * @probe: check if _RST method is included in the acpi_device context. + * @probe: if true, return 0 if device supports _RST */ -int pci_dev_acpi_reset(struct pci_dev *dev, int probe) +int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { acpi_handle handle = ACPI_HANDLE(&dev->dev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 4d9828160c48..b87bac5e4572 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4658,11 +4658,11 @@ EXPORT_SYMBOL_GPL(pcie_flr); /** * pcie_reset_flr - initiate a PCIe function level reset * @dev: device to reset - * @probe: If set, only check if the device can be reset this way. + * @probe: if true, return 0 if device can be reset this way * * Initiate a function level reset on @dev. 
*/ -int pcie_reset_flr(struct pci_dev *dev, int probe) +int pcie_reset_flr(struct pci_dev *dev, bool probe) { if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) return -ENOTTY; @@ -4677,7 +4677,7 @@ int pcie_reset_flr(struct pci_dev *dev, int probe) } EXPORT_SYMBOL_GPL(pcie_reset_flr); -static int pci_af_flr(struct pci_dev *dev, int probe) +static int pci_af_flr(struct pci_dev *dev, bool probe) { int pos; u8 cap; @@ -4724,7 +4724,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe) /** * pci_pm_reset - Put device into PCI_D3 and back into PCI_D0. * @dev: Device to reset. - * @probe: If set, only check if the device can be reset this way. + * @probe: if true, return 0 if the device can be reset this way. * * If @dev supports native PCI PM and its PCI_PM_CTRL_NO_SOFT_RESET flag is * unset, it will be reinitialized internally when going from PCI_D3hot to @@ -4736,7 +4736,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe) * by default (i.e. unless the @dev's d3hot_delay field has a different value). * Moreover, only devices in D0 can be reset by this function. 
*/ -static int pci_pm_reset(struct pci_dev *dev, int probe) +static int pci_pm_reset(struct pci_dev *dev, bool probe) { u16 csr; @@ -4996,7 +4996,7 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); -static int pci_parent_bus_reset(struct pci_dev *dev, int probe) +static int pci_parent_bus_reset(struct pci_dev *dev, bool probe) { struct pci_dev *pdev; @@ -5014,7 +5014,7 @@ static int pci_parent_bus_reset(struct pci_dev *dev, int probe) return pci_bridge_secondary_bus_reset(dev->bus->self); } -static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, int probe) +static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, bool probe) { int rc = -ENOTTY; @@ -5029,7 +5029,7 @@ static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, int probe) return rc; } -static int pci_dev_reset_slot_function(struct pci_dev *dev, int probe) +static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe) { if (dev->multifunction || dev->subordinate || !dev->slot || dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) @@ -5038,7 +5038,7 @@ static int pci_dev_reset_slot_function(struct pci_dev *dev, int probe) return pci_reset_hotplug_slot(dev->slot->hotplug, probe); } -static int pci_reset_bus_function(struct pci_dev *dev, int probe) +static int pci_reset_bus_function(struct pci_dev *dev, bool probe) { int rc; @@ -5204,7 +5204,7 @@ static ssize_t reset_method_store(struct device *dev, goto error; } - if (pci_reset_fn_methods[m].reset_fn(pdev, 1)) { + if (pci_reset_fn_methods[m].reset_fn(pdev, PCI_RESET_PROBE)) { pci_err(pdev, "Unsupported reset method '%s'", name); goto error; } @@ -5220,7 +5220,7 @@ static ssize_t reset_method_store(struct device *dev, reset_methods[n] = 0; /* Warn if dev-specific supported but not highest priority */ - if (pci_reset_fn_methods[1].reset_fn(pdev, 1) == 0 && + if (pci_reset_fn_methods[1].reset_fn(pdev, PCI_RESET_PROBE) == 0 && reset_methods[0] != 1) pci_warn(pdev, 
"Device-specific reset disabled/de-prioritized by user"); memcpy(pdev->reset_methods, reset_methods, sizeof(pdev->reset_methods)); @@ -5294,7 +5294,7 @@ int __pci_reset_function_locked(struct pci_dev *dev) if (!m) return -ENOTTY; - rc = pci_reset_fn_methods[m].reset_fn(dev, 0); + rc = pci_reset_fn_methods[m].reset_fn(dev, PCI_RESET_DO_RESET); if (!rc) return 0; if (rc != -ENOTTY) @@ -5327,7 +5327,7 @@ void pci_init_reset_methods(struct pci_dev *dev) i = 0; for (m = 1; m < PCI_NUM_RESET_METHODS; m++) { - rc = pci_reset_fn_methods[m].reset_fn(dev, 1); + rc = pci_reset_fn_methods[m].reset_fn(dev, PCI_RESET_PROBE); if (!rc) dev->reset_methods[i++] = m; else if (rc != -ENOTTY) @@ -5644,7 +5644,7 @@ static void pci_slot_restore_locked(struct pci_slot *slot) } } -static int pci_slot_reset(struct pci_slot *slot, int probe) +static int pci_slot_reset(struct pci_slot *slot, bool probe) { int rc; @@ -5672,7 +5672,7 @@ static int pci_slot_reset(struct pci_slot *slot, int probe) */ int pci_probe_reset_slot(struct pci_slot *slot) { - return pci_slot_reset(slot, 1); + return pci_slot_reset(slot, PCI_RESET_PROBE); } EXPORT_SYMBOL_GPL(pci_probe_reset_slot); @@ -5695,14 +5695,14 @@ static int __pci_reset_slot(struct pci_slot *slot) { int rc; - rc = pci_slot_reset(slot, 1); + rc = pci_slot_reset(slot, PCI_RESET_PROBE); if (rc) return rc; if (pci_slot_trylock(slot)) { pci_slot_save_and_disable_locked(slot); might_sleep(); - rc = pci_reset_hotplug_slot(slot->hotplug, 0); + rc = pci_reset_hotplug_slot(slot->hotplug, PCI_RESET_DO_RESET); pci_slot_restore_locked(slot); pci_slot_unlock(slot); } else @@ -5711,7 +5711,7 @@ static int __pci_reset_slot(struct pci_slot *slot) return rc; } -static int pci_bus_reset(struct pci_bus *bus, int probe) +static int pci_bus_reset(struct pci_bus *bus, bool probe) { int ret; @@ -5757,14 +5757,14 @@ int pci_bus_error_reset(struct pci_dev *bridge) goto bus_reset; list_for_each_entry(slot, &bus->slots, list) - if (pci_slot_reset(slot, 0)) + if 
(pci_slot_reset(slot, PCI_RESET_DO_RESET)) goto bus_reset; mutex_unlock(&pci_slot_mutex); return 0; bus_reset: mutex_unlock(&pci_slot_mutex); - return pci_bus_reset(bridge->subordinate, 0); + return pci_bus_reset(bridge->subordinate, PCI_RESET_DO_RESET); } /** @@ -5775,7 +5775,7 @@ bus_reset: */ int pci_probe_reset_bus(struct pci_bus *bus) { - return pci_bus_reset(bus, 1); + return pci_bus_reset(bus, PCI_RESET_PROBE); } EXPORT_SYMBOL_GPL(pci_probe_reset_bus); @@ -5789,7 +5789,7 @@ static int __pci_reset_bus(struct pci_bus *bus) { int rc; - rc = pci_bus_reset(bus, 1); + rc = pci_bus_reset(bus, PCI_RESET_PROBE); if (rc) return rc; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 87cfd8db8827..05b7e7e04246 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -608,18 +608,18 @@ static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) struct pci_dev_reset_methods { u16 vendor; u16 device; - int (*reset)(struct pci_dev *dev, int probe); + int (*reset)(struct pci_dev *dev, bool probe); }; struct pci_reset_fn_method { - int (*reset_fn)(struct pci_dev *pdev, int probe); + int (*reset_fn)(struct pci_dev *pdev, bool probe); char *name; }; #ifdef CONFIG_PCI_QUIRKS -int pci_dev_specific_reset(struct pci_dev *dev, int probe); +int pci_dev_specific_reset(struct pci_dev *dev, bool probe); #else -static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) +static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } @@ -708,9 +708,9 @@ static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); -int pci_dev_acpi_reset(struct pci_dev *dev, int probe); +int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); #else -static inline int pci_dev_acpi_reset(struct pci_dev *dev, int probe) +static inline int pci_dev_acpi_reset(struct 
pci_dev *dev, bool probe) { return -ENOTTY; } diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 031379deb130..9784fdcf3006 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1407,7 +1407,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) } if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) { - rc = pcie_reset_flr(dev, 0); + rc = pcie_reset_flr(dev, PCI_RESET_DO_RESET); if (!rc) pci_info(dev, "has been reset\n"); else diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index a1b57b63c624..e7657b8c8a33 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3702,7 +3702,7 @@ DECLARE_PCI_FIXUP_SUSPEND_LATE(PCI_VENDOR_ID_INTEL, * reset a single function if other methods (e.g. FLR, PM D0->D3) are * not available. */ -static int reset_intel_82599_sfp_virtfn(struct pci_dev *dev, int probe) +static int reset_intel_82599_sfp_virtfn(struct pci_dev *dev, bool probe) { /* * http://www.intel.com/content/dam/doc/datasheet/82599-10-gbe-controller-datasheet.pdf @@ -3724,7 +3724,7 @@ static int reset_intel_82599_sfp_virtfn(struct pci_dev *dev, int probe) #define NSDE_PWR_STATE 0xd0100 #define IGD_OPERATION_TIMEOUT 10000 /* set timeout 10 seconds */ -static int reset_ivb_igd(struct pci_dev *dev, int probe) +static int reset_ivb_igd(struct pci_dev *dev, bool probe) { void __iomem *mmio_base; unsigned long timeout; @@ -3767,7 +3767,7 @@ reset_complete: } /* Device-specific reset method for Chelsio T4-based adapters */ -static int reset_chelsio_generic_dev(struct pci_dev *dev, int probe) +static int reset_chelsio_generic_dev(struct pci_dev *dev, bool probe) { u16 old_command; u16 msix_flags; @@ -3845,14 +3845,14 @@ static int reset_chelsio_generic_dev(struct pci_dev *dev, int probe) * Chapter 3: NVMe control registers * Chapter 7.3: Reset behavior */ -static int nvme_disable_and_flr(struct pci_dev *dev, int probe) +static int nvme_disable_and_flr(struct pci_dev *dev, bool probe) { void __iomem *bar; u16 cmd; u32 
cfg; if (dev->class != PCI_CLASS_STORAGE_EXPRESS || - pcie_reset_flr(dev, 1) || !pci_resource_start(dev, 0)) + pcie_reset_flr(dev, PCI_RESET_PROBE) || !pci_resource_start(dev, 0)) return -ENOTTY; if (probe) @@ -3919,12 +3919,12 @@ static int nvme_disable_and_flr(struct pci_dev *dev, int probe) * device too soon after FLR. A 250ms delay after FLR has heuristically * proven to produce reliably working results for device assignment cases. */ -static int delay_250ms_after_flr(struct pci_dev *dev, int probe) +static int delay_250ms_after_flr(struct pci_dev *dev, bool probe) { if (probe) - return pcie_reset_flr(dev, 1); + return pcie_reset_flr(dev, PCI_RESET_PROBE); - pcie_reset_flr(dev, 0); + pcie_reset_flr(dev, PCI_RESET_DO_RESET); msleep(250); @@ -3939,7 +3939,7 @@ static int delay_250ms_after_flr(struct pci_dev *dev, int probe) #define HINIC_OPERATION_TIMEOUT 15000 /* 15 seconds */ /* Device-specific reset method for Huawei Intelligent NIC virtual functions */ -static int reset_hinic_vf_dev(struct pci_dev *pdev, int probe) +static int reset_hinic_vf_dev(struct pci_dev *pdev, bool probe) { unsigned long timeout; void __iomem *bar; @@ -4016,7 +4016,7 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { * because when a host assigns a device to a guest VM, the host may need * to reset the device but probably doesn't have a driver for it. */ -int pci_dev_specific_reset(struct pci_dev *dev, int probe) +int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { const struct pci_dev_reset_methods *i; diff --git a/include/linux/pci.h b/include/linux/pci.h index 98718f46a61c..a46363f29b68 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,9 @@ /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ #define PCI_NUM_RESET_METHODS 7 +#define PCI_RESET_PROBE true +#define PCI_RESET_DO_RESET false + /* * The PCI interface treats multi-function devices as independent * devices. 
The slot/function address of each device is encoded @@ -1234,7 +1237,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -int pcie_reset_flr(struct pci_dev *dev, int probe); +int pcie_reset_flr(struct pci_dev *dev, bool probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 2dac431d94ac..3a10d6ec3ee7 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -44,7 +44,7 @@ struct hotplug_slot_ops { int (*get_attention_status) (struct hotplug_slot *slot, u8 *value); int (*get_latch_status) (struct hotplug_slot *slot, u8 *value); int (*get_adapter_status) (struct hotplug_slot *slot, u8 *value); - int (*reset_slot) (struct hotplug_slot *slot, int probe); + int (*reset_slot) (struct hotplug_slot *slot, bool probe); }; /** -- cgit v1.2.3-71-gd317 From 1cf362e907f36f104b9cf590ee6ced786226b388 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:37 +0530 Subject: PCI: endpoint: Add support to add virtual function in endpoint core Add support to add virtual function in endpoint core. The virtual function can only be associated with a physical function instead of an endpoint controller. Provide APIs to associate a virtual function with a physical function here.
[weiyongjun1@huawei.com: PCI: endpoint: Fix missing unlock on error in pci_epf_add_vepf() - Reported-by: Hulk Robot ] Link: https://lore.kernel.org/r/20210819123343.1951-3-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Wei Yongjun Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/pci-epc-core.c | 2 +- drivers/pci/endpoint/pci-epf-core.c | 98 ++++++++++++++++++++++++++++++++++++- include/linux/pci-epf.h | 16 +++++- 3 files changed, 113 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index adec9bee72cf..01c58ca84dcc 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -548,7 +548,7 @@ int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf, u32 func_no; int ret = 0; - if (IS_ERR_OR_NULL(epc)) + if (IS_ERR_OR_NULL(epc) || epf->is_vf) return -EINVAL; if (type == PRIMARY_INTERFACE && epf->epc) diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index e9289d10f822..296479659aa2 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -62,13 +62,20 @@ EXPORT_SYMBOL_GPL(pci_epf_type_add_cfs); */ void pci_epf_unbind(struct pci_epf *epf) { + struct pci_epf *epf_vf; + if (!epf->driver) { dev_WARN(&epf->dev, "epf device not bound to driver\n"); return; } mutex_lock(&epf->lock); - epf->driver->ops->unbind(epf); + list_for_each_entry(epf_vf, &epf->pci_vepf, list) { + if (epf_vf->is_bound) + epf_vf->driver->ops->unbind(epf_vf); + } + if (epf->is_bound) + epf->driver->ops->unbind(epf); mutex_unlock(&epf->lock); module_put(epf->driver->owner); } @@ -83,6 +90,7 @@ EXPORT_SYMBOL_GPL(pci_epf_unbind); */ int pci_epf_bind(struct pci_epf *epf) { + struct pci_epf *epf_vf; int ret; if (!epf->driver) { @@ -94,13 +102,97 @@ int pci_epf_bind(struct pci_epf *epf) return -EAGAIN; mutex_lock(&epf->lock); + list_for_each_entry(epf_vf, &epf->pci_vepf, list) 
{ + epf_vf->func_no = epf->func_no; + epf_vf->epc = epf->epc; + epf_vf->sec_epc = epf->sec_epc; + ret = epf_vf->driver->ops->bind(epf_vf); + if (ret) + goto ret; + epf_vf->is_bound = true; + } + ret = epf->driver->ops->bind(epf); + if (ret) + goto ret; + epf->is_bound = true; + + mutex_unlock(&epf->lock); + return 0; + +ret: mutex_unlock(&epf->lock); + pci_epf_unbind(epf); return ret; } EXPORT_SYMBOL_GPL(pci_epf_bind); +/** + * pci_epf_add_vepf() - associate virtual EP function to physical EP function + * @epf_pf: the physical EP function to which the virtual EP function should be + * associated + * @epf_vf: the virtual EP function to be added + * + * A physical endpoint function can be associated with multiple virtual + * endpoint functions. Invoke pci_epf_add_vepf() to add a virtual PCI endpoint + * function to a physical PCI endpoint function. + */ +int pci_epf_add_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf) +{ + u32 vfunc_no; + + if (IS_ERR_OR_NULL(epf_pf) || IS_ERR_OR_NULL(epf_vf)) + return -EINVAL; + + if (epf_pf->epc || epf_vf->epc || epf_vf->epf_pf) + return -EBUSY; + + if (epf_pf->sec_epc || epf_vf->sec_epc) + return -EBUSY; + + mutex_lock(&epf_pf->lock); + vfunc_no = find_first_zero_bit(&epf_pf->vfunction_num_map, + BITS_PER_LONG); + if (vfunc_no >= BITS_PER_LONG) { + mutex_unlock(&epf_pf->lock); + return -EINVAL; + } + + set_bit(vfunc_no, &epf_pf->vfunction_num_map); + epf_vf->vfunc_no = vfunc_no; + + epf_vf->epf_pf = epf_pf; + epf_vf->is_vf = true; + + list_add_tail(&epf_vf->list, &epf_pf->pci_vepf); + mutex_unlock(&epf_pf->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_epf_add_vepf); + +/** + * pci_epf_remove_vepf() - remove virtual EP function from physical EP function + * @epf_pf: the physical EP function from which the virtual EP function should + * be removed + * @epf_vf: the virtual EP function to be removed + * + * Invoke to remove a virtual endpoint function from the physical endpoint + * function.
+ */ +void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf) +{ + if (IS_ERR_OR_NULL(epf_pf) || IS_ERR_OR_NULL(epf_vf)) + return; + + mutex_lock(&epf_pf->lock); + clear_bit(epf_vf->vfunc_no, &epf_pf->vfunction_num_map); + list_del(&epf_vf->list); + mutex_unlock(&epf_pf->lock); +} +EXPORT_SYMBOL_GPL(pci_epf_remove_vepf); + /** * pci_epf_free_space() - free the allocated PCI EPF register space * @epf: the EPF device from whom to free the memory @@ -317,6 +409,10 @@ struct pci_epf *pci_epf_create(const char *name) return ERR_PTR(-ENOMEM); } + /* VFs are numbered starting with 1. So set BIT(0) by default */ + epf->vfunction_num_map = 1; + INIT_LIST_HEAD(&epf->pci_vepf); + dev = &epf->dev; device_initialize(dev); dev->bus = &pci_epf_bus_type; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2debc27ba95e..043b4c9c7188 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -121,8 +121,10 @@ struct pci_epf_bar { * @bar: represents the BAR of EPF device * @msi_interrupts: number of MSI interrupts required by this function * @msix_interrupts: number of MSI-X interrupts required by this function - * @func_no: unique function number within this endpoint device + * @func_no: unique (physical) function number within this endpoint device + * @vfunc_no: unique virtual function number within a physical function * @epc: the EPC device to which this EPF device is bound + * @epf_pf: the physical EPF device to which this virtual EPF device is bound * @driver: the EPF driver to which this EPF device is bound * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc * @nb: notifier block to notify EPF of any EPC events (like linkup) @@ -133,6 +135,10 @@ struct pci_epf_bar { * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC * @sec_epc_func_no: unique (physical) function number within the secondary EPC * @group: configfs group associated with the EPF device + * @is_bound: indicates if bind 
notification to function driver has been invoked + * @is_vf: true - virtual function, false - physical function + * @vfunction_num_map: bitmap to manage virtual function number + * @pci_vepf: list of virtual endpoint functions associated with this function */ struct pci_epf { struct device dev; @@ -142,8 +148,10 @@ struct pci_epf { u8 msi_interrupts; u16 msix_interrupts; u8 func_no; + u8 vfunc_no; struct pci_epc *epc; + struct pci_epf *epf_pf; struct pci_epf_driver *driver; struct list_head list; struct notifier_block nb; @@ -156,6 +164,10 @@ struct pci_epf { struct pci_epf_bar sec_epc_bar[6]; u8 sec_epc_func_no; struct config_group *group; + unsigned int is_bound; + unsigned int is_vf; + unsigned long vfunction_num_map; + struct list_head pci_vepf; }; /** @@ -199,4 +211,6 @@ int pci_epf_bind(struct pci_epf *epf); void pci_epf_unbind(struct pci_epf *epf); struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf, struct config_group *group); +int pci_epf_add_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); +void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); #endif /* __LINUX_PCI_EPF_H */ -- cgit v1.2.3-71-gd317 From 53fd3cbe5e9d791d6bb6059f73a3851f155ce7c6 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:39 +0530 Subject: PCI: endpoint: Add virtual function number in pci_epc ops Add virtual function number in pci_epc ops. EPC controller driver can perform virtual function specific initialization based on the virtual function number. 
Link: https://lore.kernel.org/r/20210819123343.1951-5-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/cadence/pcie-cadence-ep.c | 44 ++++---- drivers/pci/controller/dwc/pcie-designware-ep.c | 36 +++---- drivers/pci/controller/pcie-rcar-ep.c | 19 ++-- drivers/pci/controller/pcie-rockchip-ep.c | 18 ++-- drivers/pci/endpoint/functions/pci-epf-ntb.c | 89 +++++++++------ drivers/pci/endpoint/functions/pci-epf-test.c | 74 +++++++------ drivers/pci/endpoint/pci-epc-core.c | 132 ++++++++++++++++------- drivers/pci/endpoint/pci-epf-core.c | 48 ++++++++- include/linux/pci-epc.h | 57 +++++----- 9 files changed, 328 insertions(+), 189 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/cadence/pcie-cadence-ep.c b/drivers/pci/controller/cadence/pcie-cadence-ep.c index 897cdde02bd8..912a15be8bfd 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-ep.c +++ b/drivers/pci/controller/cadence/pcie-cadence-ep.c @@ -16,7 +16,7 @@ #define CDNS_PCIE_EP_IRQ_PCI_ADDR_NONE 0x1 #define CDNS_PCIE_EP_IRQ_PCI_ADDR_LEGACY 0x3 -static int cdns_pcie_ep_write_header(struct pci_epc *epc, u8 fn, +static int cdns_pcie_ep_write_header(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_header *hdr) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); @@ -47,7 +47,7 @@ static int cdns_pcie_ep_write_header(struct pci_epc *epc, u8 fn, return 0; } -static int cdns_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, +static int cdns_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_bar *epf_bar) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); @@ -117,7 +117,7 @@ static int cdns_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, return 0; } -static void cdns_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, +static void cdns_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_bar *epf_bar) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); @@ -147,8 +147,8 @@ static void cdns_pcie_ep_clear_bar(struct 
pci_epc *epc, u8 fn, epf->epf_bar[bar] = NULL; } -static int cdns_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, phys_addr_t addr, - u64 pci_addr, size_t size) +static int cdns_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, u8 vfn, + phys_addr_t addr, u64 pci_addr, size_t size) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); struct cdns_pcie *pcie = &ep->pcie; @@ -169,7 +169,7 @@ static int cdns_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, phys_addr_t addr, return 0; } -static void cdns_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, +static void cdns_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); @@ -189,7 +189,7 @@ static void cdns_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, clear_bit(r, &ep->ob_region_map); } -static int cdns_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 mmc) +static int cdns_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 vfn, u8 mmc) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); struct cdns_pcie *pcie = &ep->pcie; @@ -209,7 +209,7 @@ static int cdns_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 mmc) return 0; } -static int cdns_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) +static int cdns_pcie_ep_get_msi(struct pci_epc *epc, u8 fn, u8 vfn) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); struct cdns_pcie *pcie = &ep->pcie; @@ -230,7 +230,7 @@ static int cdns_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) return mme; } -static int cdns_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no) +static int cdns_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); struct cdns_pcie *pcie = &ep->pcie; @@ -247,8 +247,9 @@ static int cdns_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no) return val; } -static int cdns_pcie_ep_set_msix(struct pci_epc *epc, u8 fn, u16 interrupts, - enum pci_barno bir, u32 offset) +static int cdns_pcie_ep_set_msix(struct pci_epc *epc, u8 fn, u8 vfn, + u16 interrupts, enum pci_barno bir, + u32 
offset) { struct cdns_pcie_ep *ep = epc_get_drvdata(epc); struct cdns_pcie *pcie = &ep->pcie; @@ -317,7 +318,8 @@ static void cdns_pcie_ep_assert_intx(struct cdns_pcie_ep *ep, u8 fn, writel(0, ep->irq_cpu_addr + offset); } -static int cdns_pcie_ep_send_legacy_irq(struct cdns_pcie_ep *ep, u8 fn, u8 intx) +static int cdns_pcie_ep_send_legacy_irq(struct cdns_pcie_ep *ep, u8 fn, u8 vfn, + u8 intx) { u16 cmd; @@ -334,7 +336,7 @@ static int cdns_pcie_ep_send_legacy_irq(struct cdns_pcie_ep *ep, u8 fn, u8 intx) return 0; } -static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep *ep, u8 fn, +static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep *ep, u8 fn, u8 vfn, u8 interrupt_num) { struct cdns_pcie *pcie = &ep->pcie; @@ -382,7 +384,7 @@ static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep *ep, u8 fn, return 0; } -static int cdns_pcie_ep_map_msi_irq(struct pci_epc *epc, u8 fn, +static int cdns_pcie_ep_map_msi_irq(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset) @@ -419,7 +421,7 @@ static int cdns_pcie_ep_map_msi_irq(struct pci_epc *epc, u8 fn, pci_addr &= GENMASK_ULL(63, 2); for (i = 0; i < interrupt_num; i++) { - ret = cdns_pcie_ep_map_addr(epc, fn, addr, + ret = cdns_pcie_ep_map_addr(epc, fn, vfn, addr, pci_addr & ~pci_addr_mask, entry_size); if (ret) @@ -433,7 +435,7 @@ static int cdns_pcie_ep_map_msi_irq(struct pci_epc *epc, u8 fn, return 0; } -static int cdns_pcie_ep_send_msix_irq(struct cdns_pcie_ep *ep, u8 fn, +static int cdns_pcie_ep_send_msix_irq(struct cdns_pcie_ep *ep, u8 fn, u8 vfn, u16 interrupt_num) { u32 cap = CDNS_PCIE_EP_FUNC_MSIX_CAP_OFFSET; @@ -478,7 +480,7 @@ static int cdns_pcie_ep_send_msix_irq(struct cdns_pcie_ep *ep, u8 fn, return 0; } -static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, +static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn, enum pci_epc_irq_type type, u16 interrupt_num) { @@ -486,13 +488,13 @@ static int 
cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, switch (type) { case PCI_EPC_IRQ_LEGACY: - return cdns_pcie_ep_send_legacy_irq(ep, fn, 0); + return cdns_pcie_ep_send_legacy_irq(ep, fn, vfn, 0); case PCI_EPC_IRQ_MSI: - return cdns_pcie_ep_send_msi_irq(ep, fn, interrupt_num); + return cdns_pcie_ep_send_msi_irq(ep, fn, vfn, interrupt_num); case PCI_EPC_IRQ_MSIX: - return cdns_pcie_ep_send_msix_irq(ep, fn, interrupt_num); + return cdns_pcie_ep_send_msix_irq(ep, fn, vfn, interrupt_num); default: break; @@ -531,7 +533,7 @@ static const struct pci_epc_features cdns_pcie_epc_features = { }; static const struct pci_epc_features* -cdns_pcie_ep_get_features(struct pci_epc *epc, u8 func_no) +cdns_pcie_ep_get_features(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { return &cdns_pcie_epc_features; } diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 8d028a88b375..998b698f4085 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -125,7 +125,7 @@ static u8 dw_pcie_ep_find_capability(struct dw_pcie_ep *ep, u8 func_no, u8 cap) return __dw_pcie_ep_find_next_cap(ep, func_no, next_cap_ptr, cap); } -static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no, +static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); @@ -202,7 +202,7 @@ static int dw_pcie_ep_outbound_atu(struct dw_pcie_ep *ep, u8 func_no, return 0; } -static void dw_pcie_ep_clear_bar(struct pci_epc *epc, u8 func_no, +static void dw_pcie_ep_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); @@ -217,7 +217,7 @@ static void dw_pcie_ep_clear_bar(struct pci_epc *epc, u8 func_no, ep->epf_bar[bar] = NULL; } -static int dw_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, +static int dw_pcie_ep_set_bar(struct 
pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar) { int ret; @@ -276,7 +276,7 @@ static int dw_pcie_find_index(struct dw_pcie_ep *ep, phys_addr_t addr, return -EINVAL; } -static void dw_pcie_ep_unmap_addr(struct pci_epc *epc, u8 func_no, +static void dw_pcie_ep_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr) { int ret; @@ -292,9 +292,8 @@ static void dw_pcie_ep_unmap_addr(struct pci_epc *epc, u8 func_no, clear_bit(atu_index, ep->ob_window_map); } -static int dw_pcie_ep_map_addr(struct pci_epc *epc, u8 func_no, - phys_addr_t addr, - u64 pci_addr, size_t size) +static int dw_pcie_ep_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + phys_addr_t addr, u64 pci_addr, size_t size) { int ret; struct dw_pcie_ep *ep = epc_get_drvdata(epc); @@ -309,7 +308,7 @@ static int dw_pcie_ep_map_addr(struct pci_epc *epc, u8 func_no, return 0; } -static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 func_no) +static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); @@ -333,7 +332,8 @@ static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 func_no) return val; } -static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts) +static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); @@ -358,7 +358,7 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts) return 0; } -static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no) +static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); @@ -382,8 +382,8 @@ static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no) return val; } -static int dw_pcie_ep_set_msix(struct pci_epc *epc, u8 func_no, 
u16 interrupts, - enum pci_barno bir, u32 offset) +static int dw_pcie_ep_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno bir, u32 offset) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); struct dw_pcie *pci = to_dw_pcie_from_ep(ep); @@ -418,7 +418,7 @@ static int dw_pcie_ep_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, return 0; } -static int dw_pcie_ep_raise_irq(struct pci_epc *epc, u8 func_no, +static int dw_pcie_ep_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); @@ -450,7 +450,7 @@ static int dw_pcie_ep_start(struct pci_epc *epc) } static const struct pci_epc_features* -dw_pcie_ep_get_features(struct pci_epc *epc, u8 func_no) +dw_pcie_ep_get_features(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { struct dw_pcie_ep *ep = epc_get_drvdata(epc); @@ -525,14 +525,14 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, aligned_offset = msg_addr_lower & (epc->mem->window.page_size - 1); msg_addr = ((u64)msg_addr_upper) << 32 | (msg_addr_lower & ~aligned_offset); - ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, + ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr, epc->mem->window.page_size); if (ret) return ret; writel(msg_data | (interrupt_num - 1), ep->msi_mem + aligned_offset); - dw_pcie_ep_unmap_addr(epc, func_no, ep->msi_mem_phys); + dw_pcie_ep_unmap_addr(epc, func_no, 0, ep->msi_mem_phys); return 0; } @@ -593,14 +593,14 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, } aligned_offset = msg_addr & (epc->mem->window.page_size - 1); - ret = dw_pcie_ep_map_addr(epc, func_no, ep->msi_mem_phys, msg_addr, + ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr, epc->mem->window.page_size); if (ret) return ret; writel(msg_data, ep->msi_mem + aligned_offset); - dw_pcie_ep_unmap_addr(epc, func_no, ep->msi_mem_phys); + 
dw_pcie_ep_unmap_addr(epc, func_no, 0, ep->msi_mem_phys); return 0; } diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c index b4a288e24aaf..6cee4e09acca 100644 --- a/drivers/pci/controller/pcie-rcar-ep.c +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -159,7 +159,7 @@ static int rcar_pcie_ep_get_pdata(struct rcar_pcie_endpoint *ep, return 0; } -static int rcar_pcie_ep_write_header(struct pci_epc *epc, u8 fn, +static int rcar_pcie_ep_write_header(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_header *hdr) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); @@ -195,7 +195,7 @@ static int rcar_pcie_ep_write_header(struct pci_epc *epc, u8 fn, return 0; } -static int rcar_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, +static int rcar_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar) { int flags = epf_bar->flags | LAR_ENABLE | LAM_64BIT; @@ -246,7 +246,7 @@ static int rcar_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, return 0; } -static void rcar_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, +static void rcar_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_bar *epf_bar) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); @@ -259,7 +259,8 @@ static void rcar_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, clear_bit(atu_index + 1, ep->ib_window_map); } -static int rcar_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 interrupts) +static int rcar_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 vfn, + u8 interrupts) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); struct rcar_pcie *pcie = &ep->pcie; @@ -272,7 +273,7 @@ static int rcar_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 interrupts) return 0; } -static int rcar_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) +static int rcar_pcie_ep_get_msi(struct pci_epc *epc, u8 fn, u8 vfn) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); struct rcar_pcie *pcie = &ep->pcie; @@ -285,7 +286,7 @@ static int 
rcar_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) return ((flags & MSICAP0_MMESE_MASK) >> MSICAP0_MMESE_OFFSET); } -static int rcar_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, +static int rcar_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr, u64 pci_addr, size_t size) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); @@ -322,7 +323,7 @@ static int rcar_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, return 0; } -static void rcar_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, +static void rcar_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr) { struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc); @@ -403,7 +404,7 @@ static int rcar_pcie_ep_assert_msi(struct rcar_pcie *pcie, return 0; } -static int rcar_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, +static int rcar_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn, enum pci_epc_irq_type type, u16 interrupt_num) { @@ -451,7 +452,7 @@ static const struct pci_epc_features rcar_pcie_epc_features = { }; static const struct pci_epc_features* -rcar_pcie_ep_get_features(struct pci_epc *epc, u8 func_no) +rcar_pcie_ep_get_features(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { return &rcar_pcie_epc_features; } diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c index 7631dc3961c1..5fb9ce6e536e 100644 --- a/drivers/pci/controller/pcie-rockchip-ep.c +++ b/drivers/pci/controller/pcie-rockchip-ep.c @@ -122,7 +122,7 @@ static void rockchip_pcie_prog_ep_ob_atu(struct rockchip_pcie *rockchip, u8 fn, ROCKCHIP_PCIE_AT_OB_REGION_CPU_ADDR1(r)); } -static int rockchip_pcie_ep_write_header(struct pci_epc *epc, u8 fn, +static int rockchip_pcie_ep_write_header(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_header *hdr) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); @@ -159,7 +159,7 @@ static int rockchip_pcie_ep_write_header(struct pci_epc *epc, u8 fn, return 0; } -static int rockchip_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, 
+static int rockchip_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_bar *epf_bar) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); @@ -227,7 +227,7 @@ static int rockchip_pcie_ep_set_bar(struct pci_epc *epc, u8 fn, return 0; } -static void rockchip_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, +static void rockchip_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, u8 vfn, struct pci_epf_bar *epf_bar) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); @@ -256,7 +256,7 @@ static void rockchip_pcie_ep_clear_bar(struct pci_epc *epc, u8 fn, ROCKCHIP_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar)); } -static int rockchip_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, +static int rockchip_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr, u64 pci_addr, size_t size) { @@ -284,7 +284,7 @@ static int rockchip_pcie_ep_map_addr(struct pci_epc *epc, u8 fn, return 0; } -static void rockchip_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, +static void rockchip_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, u8 vfn, phys_addr_t addr) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); @@ -308,7 +308,7 @@ static void rockchip_pcie_ep_unmap_addr(struct pci_epc *epc, u8 fn, clear_bit(r, &ep->ob_region_map); } -static int rockchip_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, +static int rockchip_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, u8 vfn, u8 multi_msg_cap) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); @@ -329,7 +329,7 @@ static int rockchip_pcie_ep_set_msi(struct pci_epc *epc, u8 fn, return 0; } -static int rockchip_pcie_ep_get_msi(struct pci_epc *epc, u8 fn) +static int rockchip_pcie_ep_get_msi(struct pci_epc *epc, u8 fn, u8 vfn) { struct rockchip_pcie_ep *ep = epc_get_drvdata(epc); struct rockchip_pcie *rockchip = &ep->rockchip; @@ -471,7 +471,7 @@ static int rockchip_pcie_ep_send_msi_irq(struct rockchip_pcie_ep *ep, u8 fn, return 0; } -static int rockchip_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, +static int 
rockchip_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn, enum pci_epc_irq_type type, u16 interrupt_num) { @@ -510,7 +510,7 @@ static const struct pci_epc_features rockchip_pcie_epc_features = { }; static const struct pci_epc_features* -rockchip_pcie_ep_get_features(struct pci_epc *epc, u8 func_no) +rockchip_pcie_ep_get_features(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { return &rockchip_pcie_epc_features; } diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c index bce274d02dcf..8b4756159f15 100644 --- a/drivers/pci/endpoint/functions/pci-epf-ntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c @@ -87,6 +87,7 @@ struct epf_ntb { struct epf_ntb_epc { u8 func_no; + u8 vfunc_no; bool linkup; bool is_msix; int msix_bar; @@ -143,14 +144,15 @@ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) struct epf_ntb_epc *ntb_epc; struct epf_ntb_ctrl *ctrl; struct pci_epc *epc; + u8 func_no, vfunc_no; bool is_msix; - u8 func_no; int ret; for (type = PRIMARY_INTERFACE; type <= SECONDARY_INTERFACE; type++) { ntb_epc = ntb->epc[type]; epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; is_msix = ntb_epc->is_msix; ctrl = ntb_epc->reg; if (link_up) @@ -158,7 +160,7 @@ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) else ctrl->link_status &= ~LINK_STATUS_UP; irq_type = is_msix ? 
PCI_EPC_IRQ_MSIX : PCI_EPC_IRQ_MSI; - ret = pci_epc_raise_irq(epc, func_no, irq_type, 1); + ret = pci_epc_raise_irq(epc, func_no, vfunc_no, irq_type, 1); if (ret) { dev_err(&epc->dev, "%s intf: Failed to raise Link Up IRQ\n", @@ -238,10 +240,10 @@ static int epf_ntb_configure_mw(struct epf_ntb *ntb, enum pci_barno peer_barno; struct epf_ntb_ctrl *ctrl; phys_addr_t phys_addr; + u8 func_no, vfunc_no; struct pci_epc *epc; u64 addr, size; int ret = 0; - u8 func_no; ntb_epc = ntb->epc[type]; epc = ntb_epc->epc; @@ -267,8 +269,9 @@ static int epf_ntb_configure_mw(struct epf_ntb *ntb, } func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; - ret = pci_epc_map_addr(epc, func_no, phys_addr, addr, size); + ret = pci_epc_map_addr(epc, func_no, vfunc_no, phys_addr, addr, size); if (ret) dev_err(&epc->dev, "%s intf: Failed to map memory window %d address\n", @@ -296,8 +299,8 @@ static void epf_ntb_teardown_mw(struct epf_ntb *ntb, enum pci_barno peer_barno; struct epf_ntb_ctrl *ctrl; phys_addr_t phys_addr; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; ntb_epc = ntb->epc[type]; epc = ntb_epc->epc; @@ -311,8 +314,9 @@ static void epf_ntb_teardown_mw(struct epf_ntb *ntb, if (mw + NTB_MW_OFFSET == BAR_DB_MW1) phys_addr += ctrl->mw1_offset; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; - pci_epc_unmap_addr(epc, func_no, phys_addr); + pci_epc_unmap_addr(epc, func_no, vfunc_no, phys_addr); } /** @@ -385,8 +389,8 @@ static int epf_ntb_configure_msi(struct epf_ntb *ntb, struct epf_ntb_ctrl *peer_ctrl; enum pci_barno peer_barno; phys_addr_t phys_addr; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; int ret, i; ntb_epc = ntb->epc[type]; @@ -400,8 +404,9 @@ static int epf_ntb_configure_msi(struct epf_ntb *ntb, phys_addr = peer_epf_bar->phys_addr; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; - ret = pci_epc_map_msi_irq(epc, func_no, phys_addr, db_count, + ret = pci_epc_map_msi_irq(epc, func_no, vfunc_no, phys_addr, db_count, 
db_entry_size, &db_data, &db_offset); if (ret) { dev_err(&epc->dev, "%s intf: Failed to map MSI IRQ\n", @@ -491,10 +496,10 @@ static int epf_ntb_configure_msix(struct epf_ntb *ntb, u32 db_entry_size, msg_data; enum pci_barno peer_barno; phys_addr_t phys_addr; + u8 func_no, vfunc_no; struct pci_epc *epc; size_t align; u64 msg_addr; - u8 func_no; int ret, i; ntb_epc = ntb->epc[type]; @@ -512,12 +517,13 @@ static int epf_ntb_configure_msix(struct epf_ntb *ntb, align = epc_features->align; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; db_entry_size = peer_ctrl->db_entry_size; for (i = 0; i < db_count; i++) { msg_addr = ALIGN_DOWN(msix_tbl[i].msg_addr, align); msg_data = msix_tbl[i].msg_data; - ret = pci_epc_map_addr(epc, func_no, phys_addr, msg_addr, + ret = pci_epc_map_addr(epc, func_no, vfunc_no, phys_addr, msg_addr, db_entry_size); if (ret) { dev_err(&epc->dev, @@ -586,8 +592,8 @@ epf_ntb_teardown_db(struct epf_ntb *ntb, enum pci_epc_interface_type type) struct pci_epf_bar *peer_epf_bar; enum pci_barno peer_barno; phys_addr_t phys_addr; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; ntb_epc = ntb->epc[type]; epc = ntb_epc->epc; @@ -597,8 +603,9 @@ epf_ntb_teardown_db(struct epf_ntb *ntb, enum pci_epc_interface_type type) peer_epf_bar = &peer_ntb_epc->epf_bar[peer_barno]; phys_addr = peer_epf_bar->phys_addr; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; - pci_epc_unmap_addr(epc, func_no, phys_addr); + pci_epc_unmap_addr(epc, func_no, vfunc_no, phys_addr); } /** @@ -728,14 +735,15 @@ static void epf_ntb_peer_spad_bar_clear(struct epf_ntb_epc *ntb_epc) { struct pci_epf_bar *epf_bar; enum pci_barno barno; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; barno = ntb_epc->epf_ntb_bar[BAR_PEER_SPAD]; epf_bar = &ntb_epc->epf_bar[barno]; - pci_epc_clear_bar(epc, func_no, epf_bar); + pci_epc_clear_bar(epc, func_no, vfunc_no, epf_bar); } /** @@ 
-775,9 +783,9 @@ static int epf_ntb_peer_spad_bar_set(struct epf_ntb *ntb, struct pci_epf_bar *peer_epf_bar, *epf_bar; enum pci_barno peer_barno, barno; u32 peer_spad_offset; + u8 func_no, vfunc_no; struct pci_epc *epc; struct device *dev; - u8 func_no; int ret; dev = &ntb->epf->dev; @@ -790,6 +798,7 @@ static int epf_ntb_peer_spad_bar_set(struct epf_ntb *ntb, barno = ntb_epc->epf_ntb_bar[BAR_PEER_SPAD]; epf_bar = &ntb_epc->epf_bar[barno]; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; epc = ntb_epc->epc; peer_spad_offset = peer_ntb_epc->reg->spad_offset; @@ -798,7 +807,7 @@ static int epf_ntb_peer_spad_bar_set(struct epf_ntb *ntb, epf_bar->barno = barno; epf_bar->flags = PCI_BASE_ADDRESS_MEM_TYPE_32; - ret = pci_epc_set_bar(epc, func_no, epf_bar); + ret = pci_epc_set_bar(epc, func_no, vfunc_no, epf_bar); if (ret) { dev_err(dev, "%s intf: peer SPAD BAR set failed\n", pci_epc_interface_string(type)); @@ -842,14 +851,15 @@ static void epf_ntb_config_sspad_bar_clear(struct epf_ntb_epc *ntb_epc) { struct pci_epf_bar *epf_bar; enum pci_barno barno; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; barno = ntb_epc->epf_ntb_bar[BAR_CONFIG]; epf_bar = &ntb_epc->epf_bar[barno]; - pci_epc_clear_bar(epc, func_no, epf_bar); + pci_epc_clear_bar(epc, func_no, vfunc_no, epf_bar); } /** @@ -886,10 +896,10 @@ static int epf_ntb_config_sspad_bar_set(struct epf_ntb_epc *ntb_epc) { struct pci_epf_bar *epf_bar; enum pci_barno barno; + u8 func_no, vfunc_no; struct epf_ntb *ntb; struct pci_epc *epc; struct device *dev; - u8 func_no; int ret; ntb = ntb_epc->epf_ntb; @@ -897,10 +907,11 @@ static int epf_ntb_config_sspad_bar_set(struct epf_ntb_epc *ntb_epc) epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; barno = ntb_epc->epf_ntb_bar[BAR_CONFIG]; epf_bar = &ntb_epc->epf_bar[barno]; - ret = pci_epc_set_bar(epc, func_no, epf_bar); + ret = pci_epc_set_bar(epc, 
func_no, vfunc_no, epf_bar); if (ret) { dev_err(dev, "%s inft: Config/Status/SPAD BAR set failed\n", pci_epc_interface_string(ntb_epc->type)); @@ -1214,17 +1225,18 @@ static void epf_ntb_db_mw_bar_clear(struct epf_ntb_epc *ntb_epc) struct pci_epf_bar *epf_bar; enum epf_ntb_bar bar; enum pci_barno barno; + u8 func_no, vfunc_no; struct pci_epc *epc; - u8 func_no; epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; for (bar = BAR_DB_MW1; bar < BAR_MW4; bar++) { barno = ntb_epc->epf_ntb_bar[bar]; epf_bar = &ntb_epc->epf_bar[barno]; - pci_epc_clear_bar(epc, func_no, epf_bar); + pci_epc_clear_bar(epc, func_no, vfunc_no, epf_bar); } } @@ -1263,10 +1275,10 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb, const struct pci_epc_features *epc_features; bool msix_capable, msi_capable; struct epf_ntb_epc *ntb_epc; + u8 func_no, vfunc_no; struct pci_epc *epc; struct device *dev; u32 db_count; - u8 func_no; int ret; ntb_epc = ntb->epc[type]; @@ -1282,6 +1294,7 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb, } func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; db_count = ntb->db_count; if (db_count > MAX_DB_COUNT) { @@ -1293,7 +1306,7 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb, epc = ntb_epc->epc; if (msi_capable) { - ret = pci_epc_set_msi(epc, func_no, db_count); + ret = pci_epc_set_msi(epc, func_no, vfunc_no, db_count); if (ret) { dev_err(dev, "%s intf: MSI configuration failed\n", pci_epc_interface_string(type)); @@ -1302,7 +1315,7 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb, } if (msix_capable) { - ret = pci_epc_set_msix(epc, func_no, db_count, + ret = pci_epc_set_msix(epc, func_no, vfunc_no, db_count, ntb_epc->msix_bar, ntb_epc->msix_table_offset); if (ret) { @@ -1423,11 +1436,11 @@ static int epf_ntb_db_mw_bar_init(struct epf_ntb *ntb, u32 num_mws, db_count; enum epf_ntb_bar bar; enum pci_barno barno; + u8 func_no, vfunc_no; struct pci_epc *epc; struct device *dev; size_t 
align; int ret, i; - u8 func_no; u64 size; ntb_epc = ntb->epc[type]; @@ -1437,6 +1450,7 @@ static int epf_ntb_db_mw_bar_init(struct epf_ntb *ntb, epc_features = ntb_epc->epc_features; align = epc_features->align; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; epc = ntb_epc->epc; num_mws = ntb->num_mws; db_count = ntb->db_count; @@ -1464,7 +1478,7 @@ static int epf_ntb_db_mw_bar_init(struct epf_ntb *ntb, barno = ntb_epc->epf_ntb_bar[bar]; epf_bar = &ntb_epc->epf_bar[barno]; - ret = pci_epc_set_bar(epc, func_no, epf_bar); + ret = pci_epc_set_bar(epc, func_no, vfunc_no, epf_bar); if (ret) { dev_err(dev, "%s intf: DoorBell BAR set failed\n", pci_epc_interface_string(type)); @@ -1536,9 +1550,9 @@ static int epf_ntb_epc_create_interface(struct epf_ntb *ntb, const struct pci_epc_features *epc_features; struct pci_epf_bar *epf_bar; struct epf_ntb_epc *ntb_epc; + u8 func_no, vfunc_no; struct pci_epf *epf; struct device *dev; - u8 func_no; dev = &ntb->epf->dev; @@ -1547,6 +1561,7 @@ static int epf_ntb_epc_create_interface(struct epf_ntb *ntb, return -ENOMEM; epf = ntb->epf; + vfunc_no = epf->vfunc_no; if (type == PRIMARY_INTERFACE) { func_no = epf->func_no; epf_bar = epf->bar; @@ -1558,11 +1573,12 @@ static int epf_ntb_epc_create_interface(struct epf_ntb *ntb, ntb_epc->linkup = false; ntb_epc->epc = epc; ntb_epc->func_no = func_no; + ntb_epc->vfunc_no = vfunc_no; ntb_epc->type = type; ntb_epc->epf_bar = epf_bar; ntb_epc->epf_ntb = ntb; - epc_features = pci_epc_get_features(epc, func_no); + epc_features = pci_epc_get_features(epc, func_no, vfunc_no); if (!epc_features) return -EINVAL; ntb_epc->epc_features = epc_features; @@ -1702,10 +1718,10 @@ static int epf_ntb_epc_init_interface(struct epf_ntb *ntb, enum pci_epc_interface_type type) { struct epf_ntb_epc *ntb_epc; + u8 func_no, vfunc_no; struct pci_epc *epc; struct pci_epf *epf; struct device *dev; - u8 func_no; int ret; ntb_epc = ntb->epc[type]; @@ -1713,6 +1729,7 @@ static int 
epf_ntb_epc_init_interface(struct epf_ntb *ntb, dev = &epf->dev; epc = ntb_epc->epc; func_no = ntb_epc->func_no; + vfunc_no = ntb_epc->vfunc_no; ret = epf_ntb_config_sspad_bar_set(ntb->epc[type]); if (ret) { @@ -1742,11 +1759,13 @@ static int epf_ntb_epc_init_interface(struct epf_ntb *ntb, goto err_db_mw_bar_init; } - ret = pci_epc_write_header(epc, func_no, epf->header); - if (ret) { - dev_err(dev, "%s intf: Configuration header write failed\n", - pci_epc_interface_string(type)); - goto err_write_header; + if (vfunc_no <= 1) { + ret = pci_epc_write_header(epc, func_no, vfunc_no, epf->header); + if (ret) { + dev_err(dev, "%s intf: Configuration header write failed\n", + pci_epc_interface_string(type)); + goto err_write_header; + } } INIT_DELAYED_WORK(&ntb->epc[type]->cmd_handler, epf_ntb_cmd_handler); diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index d2708ca4bece..90d84d3bc868 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -247,8 +247,8 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test) goto err; } - ret = pci_epc_map_addr(epc, epf->func_no, src_phys_addr, reg->src_addr, - reg->size); + ret = pci_epc_map_addr(epc, epf->func_no, epf->vfunc_no, src_phys_addr, + reg->src_addr, reg->size); if (ret) { dev_err(dev, "Failed to map source address\n"); reg->status = STATUS_SRC_ADDR_INVALID; @@ -263,8 +263,8 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test) goto err_src_map_addr; } - ret = pci_epc_map_addr(epc, epf->func_no, dst_phys_addr, reg->dst_addr, - reg->size); + ret = pci_epc_map_addr(epc, epf->func_no, epf->vfunc_no, dst_phys_addr, + reg->dst_addr, reg->size); if (ret) { dev_err(dev, "Failed to map destination address\n"); reg->status = STATUS_DST_ADDR_INVALID; @@ -291,13 +291,13 @@ static int pci_epf_test_copy(struct pci_epf_test *epf_test) pci_epf_test_print_rate("COPY", reg->size, &start, &end, use_dma); 
err_map_addr: - pci_epc_unmap_addr(epc, epf->func_no, dst_phys_addr); + pci_epc_unmap_addr(epc, epf->func_no, epf->vfunc_no, dst_phys_addr); err_dst_addr: pci_epc_mem_free_addr(epc, dst_phys_addr, dst_addr, reg->size); err_src_map_addr: - pci_epc_unmap_addr(epc, epf->func_no, src_phys_addr); + pci_epc_unmap_addr(epc, epf->func_no, epf->vfunc_no, src_phys_addr); err_src_addr: pci_epc_mem_free_addr(epc, src_phys_addr, src_addr, reg->size); @@ -331,8 +331,8 @@ static int pci_epf_test_read(struct pci_epf_test *epf_test) goto err; } - ret = pci_epc_map_addr(epc, epf->func_no, phys_addr, reg->src_addr, - reg->size); + ret = pci_epc_map_addr(epc, epf->func_no, epf->vfunc_no, phys_addr, + reg->src_addr, reg->size); if (ret) { dev_err(dev, "Failed to map address\n"); reg->status = STATUS_SRC_ADDR_INVALID; @@ -386,7 +386,7 @@ err_dma_map: kfree(buf); err_map_addr: - pci_epc_unmap_addr(epc, epf->func_no, phys_addr); + pci_epc_unmap_addr(epc, epf->func_no, epf->vfunc_no, phys_addr); err_addr: pci_epc_mem_free_addr(epc, phys_addr, src_addr, reg->size); @@ -419,8 +419,8 @@ static int pci_epf_test_write(struct pci_epf_test *epf_test) goto err; } - ret = pci_epc_map_addr(epc, epf->func_no, phys_addr, reg->dst_addr, - reg->size); + ret = pci_epc_map_addr(epc, epf->func_no, epf->vfunc_no, phys_addr, + reg->dst_addr, reg->size); if (ret) { dev_err(dev, "Failed to map address\n"); reg->status = STATUS_DST_ADDR_INVALID; @@ -479,7 +479,7 @@ err_dma_map: kfree(buf); err_map_addr: - pci_epc_unmap_addr(epc, epf->func_no, phys_addr); + pci_epc_unmap_addr(epc, epf->func_no, epf->vfunc_no, phys_addr); err_addr: pci_epc_mem_free_addr(epc, phys_addr, dst_addr, reg->size); @@ -501,13 +501,16 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq_type, switch (irq_type) { case IRQ_TYPE_LEGACY: - pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, 0); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_LEGACY, 0); break; case IRQ_TYPE_MSI: - 
pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSI, irq); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_MSI, irq); break; case IRQ_TYPE_MSIX: - pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, irq); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_MSIX, irq); break; default: dev_err(dev, "Failed to raise IRQ, unknown type\n"); @@ -542,7 +545,8 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) if (command & COMMAND_RAISE_LEGACY_IRQ) { reg->status = STATUS_IRQ_RAISED; - pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_LEGACY, 0); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_LEGACY, 0); goto reset_handler; } @@ -580,22 +584,22 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) } if (command & COMMAND_RAISE_MSI_IRQ) { - count = pci_epc_get_msi(epc, epf->func_no); + count = pci_epc_get_msi(epc, epf->func_no, epf->vfunc_no); if (reg->irq_number > count || count <= 0) goto reset_handler; reg->status = STATUS_IRQ_RAISED; - pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSI, - reg->irq_number); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_MSI, reg->irq_number); goto reset_handler; } if (command & COMMAND_RAISE_MSIX_IRQ) { - count = pci_epc_get_msix(epc, epf->func_no); + count = pci_epc_get_msix(epc, epf->func_no, epf->vfunc_no); if (reg->irq_number > count || count <= 0) goto reset_handler; reg->status = STATUS_IRQ_RAISED; - pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, - reg->irq_number); + pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, + PCI_EPC_IRQ_MSIX, reg->irq_number); goto reset_handler; } @@ -618,7 +622,8 @@ static void pci_epf_test_unbind(struct pci_epf *epf) epf_bar = &epf->bar[bar]; if (epf_test->reg[bar]) { - pci_epc_clear_bar(epc, epf->func_no, epf_bar); + pci_epc_clear_bar(epc, epf->func_no, epf->vfunc_no, + epf_bar); pci_epf_free_space(epf, epf_test->reg[bar], bar, PRIMARY_INTERFACE); } @@ -650,7 +655,8 @@ static int 
pci_epf_test_set_bar(struct pci_epf *epf) if (!!(epc_features->reserved_bar & (1 << bar))) continue; - ret = pci_epc_set_bar(epc, epf->func_no, epf_bar); + ret = pci_epc_set_bar(epc, epf->func_no, epf->vfunc_no, + epf_bar); if (ret) { pci_epf_free_space(epf, epf_test->reg[bar], bar, PRIMARY_INTERFACE); @@ -674,16 +680,18 @@ static int pci_epf_test_core_init(struct pci_epf *epf) bool msi_capable = true; int ret; - epc_features = pci_epc_get_features(epc, epf->func_no); + epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no); if (epc_features) { msix_capable = epc_features->msix_capable; msi_capable = epc_features->msi_capable; } - ret = pci_epc_write_header(epc, epf->func_no, header); - if (ret) { - dev_err(dev, "Configuration header write failed\n"); - return ret; + if (epf->vfunc_no <= 1) { + ret = pci_epc_write_header(epc, epf->func_no, epf->vfunc_no, header); + if (ret) { + dev_err(dev, "Configuration header write failed\n"); + return ret; + } } ret = pci_epf_test_set_bar(epf); @@ -691,7 +699,8 @@ static int pci_epf_test_core_init(struct pci_epf *epf) return ret; if (msi_capable) { - ret = pci_epc_set_msi(epc, epf->func_no, epf->msi_interrupts); + ret = pci_epc_set_msi(epc, epf->func_no, epf->vfunc_no, + epf->msi_interrupts); if (ret) { dev_err(dev, "MSI configuration failed\n"); return ret; @@ -699,7 +708,8 @@ static int pci_epf_test_core_init(struct pci_epf *epf) } if (msix_capable) { - ret = pci_epc_set_msix(epc, epf->func_no, epf->msix_interrupts, + ret = pci_epc_set_msix(epc, epf->func_no, epf->vfunc_no, + epf->msix_interrupts, epf_test->test_reg_bar, epf_test->msix_table_offset); if (ret) { @@ -832,7 +842,7 @@ static int pci_epf_test_bind(struct pci_epf *epf) if (WARN_ON_ONCE(!epc)) return -EINVAL; - epc_features = pci_epc_get_features(epc, epf->func_no); + epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no); if (!epc_features) { dev_err(&epf->dev, "epc_features not implemented\n"); return -EOPNOTSUPP; diff --git 
a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index 01c58ca84dcc..ecbb0fb3b653 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -137,24 +137,29 @@ EXPORT_SYMBOL_GPL(pci_epc_get_next_free_bar); * @epc: the features supported by *this* EPC device will be returned * @func_no: the features supported by the EPC device specific to the * endpoint function with func_no will be returned + * @vfunc_no: the features supported by the EPC device specific to the + * virtual endpoint function with vfunc_no will be returned * * Invoke to get the features provided by the EPC which may be * specific to an endpoint function. Returns pci_epc_features on success * and NULL for any failures. */ const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc, - u8 func_no) + u8 func_no, u8 vfunc_no) { const struct pci_epc_features *epc_features; if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return NULL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return NULL; + if (!epc->ops->get_features) return NULL; mutex_lock(&epc->lock); - epc_features = epc->ops->get_features(epc, func_no); + epc_features = epc->ops->get_features(epc, func_no, vfunc_no); mutex_unlock(&epc->lock); return epc_features; @@ -205,13 +210,14 @@ EXPORT_SYMBOL_GPL(pci_epc_start); /** * pci_epc_raise_irq() - interrupt the host system * @epc: the EPC device which has to interrupt the host - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @type: specify the type of interrupt; legacy, MSI or MSI-X * @interrupt_num: the MSI or MSI-X interrupt number * * Invoke to raise an legacy, MSI or MSI-X interrupt */ -int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum 
pci_epc_irq_type type, u16 interrupt_num) { int ret; @@ -219,11 +225,14 @@ int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->raise_irq) return 0; mutex_lock(&epc->lock); - ret = epc->ops->raise_irq(epc, func_no, type, interrupt_num); + ret = epc->ops->raise_irq(epc, func_no, vfunc_no, type, interrupt_num); mutex_unlock(&epc->lock); return ret; @@ -235,6 +244,7 @@ EXPORT_SYMBOL_GPL(pci_epc_raise_irq); * MSI data * @epc: the EPC device which has the MSI capability * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @phys_addr: the physical address of the outbound region * @interrupt_num: the MSI interrupt number * @entry_size: Size of Outbound address region for each interrupt @@ -250,21 +260,25 @@ EXPORT_SYMBOL_GPL(pci_epc_raise_irq); * physical address (in outbound region) of the other interface to ring * doorbell. 
*/ -int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, phys_addr_t phys_addr, - u8 interrupt_num, u32 entry_size, u32 *msi_data, - u32 *msi_addr_offset) +int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, + u32 *msi_data, u32 *msi_addr_offset) { int ret; if (IS_ERR_OR_NULL(epc)) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->map_msi_irq) return -EINVAL; mutex_lock(&epc->lock); - ret = epc->ops->map_msi_irq(epc, func_no, phys_addr, interrupt_num, - entry_size, msi_data, msi_addr_offset); + ret = epc->ops->map_msi_irq(epc, func_no, vfunc_no, phys_addr, + interrupt_num, entry_size, msi_data, + msi_addr_offset); mutex_unlock(&epc->lock); return ret; @@ -274,22 +288,26 @@ EXPORT_SYMBOL_GPL(pci_epc_map_msi_irq); /** * pci_epc_get_msi() - get the number of MSI interrupt numbers allocated * @epc: the EPC device to which MSI interrupts was requested - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * * Invoke to get the number of MSI interrupts allocated by the RC */ -int pci_epc_get_msi(struct pci_epc *epc, u8 func_no) +int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { int interrupt; if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return 0; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return 0; + if (!epc->ops->get_msi) return 0; mutex_lock(&epc->lock); - interrupt = epc->ops->get_msi(epc, func_no); + interrupt = epc->ops->get_msi(epc, func_no, vfunc_no); mutex_unlock(&epc->lock); if (interrupt < 0) @@ -304,12 +322,13 @@ EXPORT_SYMBOL_GPL(pci_epc_get_msi); /** * pci_epc_set_msi() - set the number of MSI interrupt numbers required * @epc: the EPC device on which MSI has to be configured - * @func_no: 
the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @interrupts: number of MSI interrupts required by the EPF * * Invoke to set the required number of MSI interrupts. */ -int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts) +int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u8 interrupts) { int ret; u8 encode_int; @@ -318,13 +337,16 @@ int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts) interrupts > 32) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->set_msi) return 0; encode_int = order_base_2(interrupts); mutex_lock(&epc->lock); - ret = epc->ops->set_msi(epc, func_no, encode_int); + ret = epc->ops->set_msi(epc, func_no, vfunc_no, encode_int); mutex_unlock(&epc->lock); return ret; @@ -334,22 +356,26 @@ EXPORT_SYMBOL_GPL(pci_epc_set_msi); /** * pci_epc_get_msix() - get the number of MSI-X interrupt numbers allocated * @epc: the EPC device to which MSI-X interrupts was requested - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * * Invoke to get the number of MSI-X interrupts allocated by the RC */ -int pci_epc_get_msix(struct pci_epc *epc, u8 func_no) +int pci_epc_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no) { int interrupt; if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return 0; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return 0; + if (!epc->ops->get_msix) return 0; mutex_lock(&epc->lock); - interrupt = epc->ops->get_msix(epc, func_no); + interrupt = epc->ops->get_msix(epc, func_no, vfunc_no); mutex_unlock(&epc->lock); if (interrupt < 0) @@ -362,15 +388,16 @@ 
EXPORT_SYMBOL_GPL(pci_epc_get_msix); /** * pci_epc_set_msix() - set the number of MSI-X interrupt numbers required * @epc: the EPC device on which MSI-X has to be configured - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @interrupts: number of MSI-X interrupts required by the EPF * @bir: BAR where the MSI-X table resides * @offset: Offset pointing to the start of MSI-X table * * Invoke to set the required number of MSI-X interrupts. */ -int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno bir, u32 offset) +int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno bir, u32 offset) { int ret; @@ -378,11 +405,15 @@ int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, interrupts < 1 || interrupts > 2048) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->set_msix) return 0; mutex_lock(&epc->lock); - ret = epc->ops->set_msix(epc, func_no, interrupts - 1, bir, offset); + ret = epc->ops->set_msix(epc, func_no, vfunc_no, interrupts - 1, bir, + offset); mutex_unlock(&epc->lock); return ret; @@ -392,22 +423,26 @@ EXPORT_SYMBOL_GPL(pci_epc_set_msix); /** * pci_epc_unmap_addr() - unmap CPU address from PCI address * @epc: the EPC device on which address is allocated - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @phys_addr: physical address of the local system * * Invoke to unmap the CPU address from PCI address. 
*/ -void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, +void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr) { if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return; + if (!epc->ops->unmap_addr) return; mutex_lock(&epc->lock); - epc->ops->unmap_addr(epc, func_no, phys_addr); + epc->ops->unmap_addr(epc, func_no, vfunc_no, phys_addr); mutex_unlock(&epc->lock); } EXPORT_SYMBOL_GPL(pci_epc_unmap_addr); @@ -415,14 +450,15 @@ EXPORT_SYMBOL_GPL(pci_epc_unmap_addr); /** * pci_epc_map_addr() - map CPU address to PCI address * @epc: the EPC device on which address is allocated - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @phys_addr: physical address of the local system * @pci_addr: PCI address to which the physical address should be mapped * @size: the size of the allocation * * Invoke to map CPU address with PCI address. 
*/ -int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, +int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u64 pci_addr, size_t size) { int ret; @@ -430,11 +466,15 @@ int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->map_addr) return 0; mutex_lock(&epc->lock); - ret = epc->ops->map_addr(epc, func_no, phys_addr, pci_addr, size); + ret = epc->ops->map_addr(epc, func_no, vfunc_no, phys_addr, pci_addr, + size); mutex_unlock(&epc->lock); return ret; @@ -444,12 +484,13 @@ EXPORT_SYMBOL_GPL(pci_epc_map_addr); /** * pci_epc_clear_bar() - reset the BAR * @epc: the EPC device for which the BAR has to be cleared - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @epf_bar: the struct epf_bar that contains the BAR information * * Invoke to reset the BAR of the endpoint device. 
*/ -void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, +void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar) { if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions || @@ -457,11 +498,14 @@ void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, epf_bar->flags & PCI_BASE_ADDRESS_MEM_TYPE_64)) return; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return; + if (!epc->ops->clear_bar) return; mutex_lock(&epc->lock); - epc->ops->clear_bar(epc, func_no, epf_bar); + epc->ops->clear_bar(epc, func_no, vfunc_no, epf_bar); mutex_unlock(&epc->lock); } EXPORT_SYMBOL_GPL(pci_epc_clear_bar); @@ -469,12 +513,13 @@ EXPORT_SYMBOL_GPL(pci_epc_clear_bar); /** * pci_epc_set_bar() - configure BAR in order for host to assign PCI addr space * @epc: the EPC device on which BAR has to be configured - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @epf_bar: the struct epf_bar that contains the BAR information * * Invoke to configure the BAR of the endpoint device. 
*/ -int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, +int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar) { int ret; @@ -489,11 +534,14 @@ int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, !(flags & PCI_BASE_ADDRESS_MEM_TYPE_64))) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + if (!epc->ops->set_bar) return 0; mutex_lock(&epc->lock); - ret = epc->ops->set_bar(epc, func_no, epf_bar); + ret = epc->ops->set_bar(epc, func_no, vfunc_no, epf_bar); mutex_unlock(&epc->lock); return ret; @@ -503,7 +551,8 @@ EXPORT_SYMBOL_GPL(pci_epc_set_bar); /** * pci_epc_write_header() - write standard configuration header * @epc: the EPC device to which the configuration header should be written - * @func_no: the endpoint function number in the EPC device + * @func_no: the physical endpoint function number in the EPC device + * @vfunc_no: the virtual endpoint function number in the physical function * @header: standard configuration header fields * * Invoke to write the configuration header to the endpoint controller. Every @@ -511,7 +560,7 @@ EXPORT_SYMBOL_GPL(pci_epc_set_bar); * configuration header would be written. The callback function should write * the header fields to this dedicated location. 
*/ -int pci_epc_write_header(struct pci_epc *epc, u8 func_no, +int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *header) { int ret; @@ -519,11 +568,18 @@ int pci_epc_write_header(struct pci_epc *epc, u8 func_no, if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions) return -EINVAL; + if (vfunc_no > 0 && (!epc->max_vfs || vfunc_no > epc->max_vfs[func_no])) + return -EINVAL; + + /* Only Virtual Function #1 has deviceID */ + if (vfunc_no > 1) + return -EINVAL; + if (!epc->ops->write_header) return 0; mutex_lock(&epc->lock); - ret = epc->ops->write_header(epc, func_no, header); + ret = epc->ops->write_header(epc, func_no, vfunc_no, header); mutex_unlock(&epc->lock); return ret; diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index 296479659aa2..af691b317f74 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -90,11 +90,14 @@ EXPORT_SYMBOL_GPL(pci_epf_unbind); */ int pci_epf_bind(struct pci_epf *epf) { + struct device *dev = &epf->dev; struct pci_epf *epf_vf; + u8 func_no, vfunc_no; + struct pci_epc *epc; int ret; if (!epf->driver) { - dev_WARN(&epf->dev, "epf device not bound to driver\n"); + dev_WARN(dev, "epf device not bound to driver\n"); return -EINVAL; } @@ -103,7 +106,50 @@ int pci_epf_bind(struct pci_epf *epf) mutex_lock(&epf->lock); list_for_each_entry(epf_vf, &epf->pci_vepf, list) { + vfunc_no = epf_vf->vfunc_no; + + if (vfunc_no < 1) { + dev_err(dev, "Invalid virtual function number\n"); + ret = -EINVAL; + goto ret; + } + + epc = epf->epc; + func_no = epf->func_no; + if (!IS_ERR_OR_NULL(epc)) { + if (!epc->max_vfs) { + dev_err(dev, "No support for virt function\n"); + ret = -EINVAL; + goto ret; + } + + if (vfunc_no > epc->max_vfs[func_no]) { + dev_err(dev, "PF%d: Exceeds max vfunc number\n", + func_no); + ret = -EINVAL; + goto ret; + } + } + + epc = epf->sec_epc; + func_no = epf->sec_epc_func_no; + if (!IS_ERR_OR_NULL(epc)) { + if 
(!epc->max_vfs) { + dev_err(dev, "No support for virt function\n"); + ret = -EINVAL; + goto ret; + } + + if (vfunc_no > epc->max_vfs[func_no]) { + dev_err(dev, "PF%d: Exceeds max vfunc number\n", + func_no); + ret = -EINVAL; + goto ret; + } + } + epf_vf->func_no = epf->func_no; + epf_vf->sec_epc_func_no = epf->sec_epc_func_no; epf_vf->epc = epf->epc; epf_vf->sec_epc = epf->sec_epc; ret = epf_vf->driver->ops->bind(epf_vf); diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 50a649d33e68..a48778e1a4ee 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -62,31 +62,32 @@ pci_epc_interface_string(enum pci_epc_interface_type type) * @owner: the module owner containing the ops */ struct pci_epc_ops { - int (*write_header)(struct pci_epc *epc, u8 func_no, + int (*write_header)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); - int (*set_bar)(struct pci_epc *epc, u8 func_no, + int (*set_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - void (*clear_bar)(struct pci_epc *epc, u8 func_no, + void (*clear_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - int (*map_addr)(struct pci_epc *epc, u8 func_no, + int (*map_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr, u64 pci_addr, size_t size); - void (*unmap_addr)(struct pci_epc *epc, u8 func_no, + void (*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr); - int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 interrupts); - int (*get_msi)(struct pci_epc *epc, u8 func_no); - int (*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); - int (*get_msix)(struct pci_epc *epc, u8 func_no); - int (*raise_irq)(struct pci_epc *epc, u8 func_no, + int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); + int (*get_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*set_msix)(struct pci_epc *epc, u8 func_no, 
u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); + int (*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); - int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, + int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); int (*start)(struct pci_epc *epc); void (*stop)(struct pci_epc *epc); const struct pci_epc_features* (*get_features)(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); struct module *owner; }; @@ -128,6 +129,8 @@ struct pci_epc_mem { * single window. * @num_windows: number of windows supported by device * @max_functions: max number of functions that can be configured in this EPC + * @max_vfs: Array indicating the maximum number of virtual functions that can + * be associated with each physical function * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops * @function_num_map: bitmap to manage physical function number @@ -141,6 +144,7 @@ struct pci_epc { struct pci_epc_mem *mem; unsigned int num_windows; u8 max_functions; + u8 *max_vfs; struct config_group *group; /* mutex to protect against concurrent access of EP controller */ struct mutex lock; @@ -208,31 +212,32 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); -int pci_epc_write_header(struct pci_epc *epc, u8 func_no, +int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); -int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, +int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, +void pci_epc_clear_bar(struct pci_epc *epc, u8 
func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, +int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u64 pci_addr, size_t size); -void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, +void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr); -int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts); -int pci_epc_get_msi(struct pci_epc *epc, u8 func_no); -int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); -int pci_epc_get_msix(struct pci_epc *epc, u8 func_no); -int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); +int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); +int pci_epc_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); -int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); int pci_epc_start(struct pci_epc *epc); void pci_epc_stop(struct pci_epc *epc); const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); enum pci_barno pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features); enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features -- cgit v1.2.3-71-gd317 From 249dbe74d3c4b568a623fb55c56cddf19fdf0b89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:22 +0100 Subject: ARM: 9108/1: oabi-compat: rework epoll_wait/epoll_pwait emulation The epoll_wait() 
system call wrapper is one of the remaining users of the set_fs() infrastructure for Arm. Changing it to not require set_fs() is rather complex unfortunately. The approach I'm taking here is to allow architectures to override the code that copies the output to user space, and let the oabi-compat implementation check whether it is getting called from an EABI or OABI system call based on the thread_info->syscall value. The in_oabi_syscall() check here mirrors the in_compat_syscall() and in_x32_syscall() helpers for 32-bit compat implementations on other architectures. Overall, the amount of code goes down, at least with the newly added sys_oabi_epoll_pwait() helper getting removed again. The downside is added complexity in the source code for the native implementation. There should be no difference in runtime performance except for Arm kernels with CONFIG_OABI_COMPAT enabled that now have to go through an external function call to check which of the two variants to use. Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/syscall.h | 11 ++++++ arch/arm/kernel/sys_oabi-compat.c | 83 ++++++++------------------------------- arch/arm/tools/syscall.tbl | 4 +- fs/eventpoll.c | 5 +-- include/linux/eventpoll.h | 18 +++++++++ 5 files changed, 49 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index f055e846a5cc..24c19d63ff0a 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -28,6 +28,17 @@ static inline int syscall_get_nr(struct task_struct *task, return task_thread_info(task)->abi_syscall & __NR_SYSCALL_MASK; } +static inline bool __in_oabi_syscall(struct task_struct *task) +{ + return IS_ENABLED(CONFIG_OABI_COMPAT) && + (task_thread_info(task)->abi_syscall & __NR_OABI_SYSCALL_BASE); +} + +static inline bool in_oabi_syscall(void) +{ + return __in_oabi_syscall(current); +} + static inline 
void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 443203fafb6b..1f6a433200f1 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -83,6 +83,8 @@ #include #include +#include + struct oldabi_stat64 { unsigned long long st_dev; unsigned int __pad1; @@ -264,87 +266,34 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd, return do_epoll_ctl(epfd, op, fd, &kernel, false); } - -static long do_oabi_epoll_wait(int epfd, struct oabi_epoll_event __user *events, - int maxevents, int timeout) -{ - struct epoll_event *kbuf; - struct oabi_epoll_event e; - mm_segment_t fs; - long ret, err, i; - - if (maxevents <= 0 || - maxevents > (INT_MAX/sizeof(*kbuf)) || - maxevents > (INT_MAX/sizeof(*events))) - return -EINVAL; - if (!access_ok(events, sizeof(*events) * maxevents)) - return -EFAULT; - kbuf = kmalloc_array(maxevents, sizeof(*kbuf), GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_epoll_wait(epfd, kbuf, maxevents, timeout); - set_fs(fs); - err = 0; - for (i = 0; i < ret; i++) { - e.events = kbuf[i].events; - e.data = kbuf[i].data; - err = __copy_to_user(events, &e, sizeof(e)); - if (err) - break; - events++; - } - kfree(kbuf); - return err ? 
-EFAULT : ret; -} #else asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd, struct oabi_epoll_event __user *event) { return -EINVAL; } - -asmlinkage long sys_oabi_epoll_wait(int epfd, - struct oabi_epoll_event __user *events, - int maxevents, int timeout) -{ - return -EINVAL; -} #endif -SYSCALL_DEFINE4(oabi_epoll_wait, int, epfd, - struct oabi_epoll_event __user *, events, - int, maxevents, int, timeout) +struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent) { - return do_oabi_epoll_wait(epfd, events, maxevents, timeout); -} + if (in_oabi_syscall()) { + struct oabi_epoll_event __user *oevent = (void __user *)uevent; -/* - * Implement the event wait interface for the eventpoll file. It is the kernel - * part of the user space epoll_pwait(2). - */ -SYSCALL_DEFINE6(oabi_epoll_pwait, int, epfd, - struct oabi_epoll_event __user *, events, int, maxevents, - int, timeout, const sigset_t __user *, sigmask, - size_t, sigsetsize) -{ - int error; + if (__put_user(revents, &oevent->events) || + __put_user(data, &oevent->data)) + return NULL; - /* - * If the caller wants a certain signal mask to be set during the wait, - * we apply it here. 
- */ - error = set_user_sigmask(sigmask, sigsetsize); - if (error) - return error; + return (void __user *)(oevent+1); + } - error = do_oabi_epoll_wait(epfd, events, maxevents, timeout); - restore_saved_sigmask_unless(error == -EINTR); + if (__put_user(revents, &uevent->events) || + __put_user(data, &uevent->data)) + return NULL; - return error; + return uevent+1; } -#endif struct oabi_sembuf { unsigned short sem_num; diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 11d0b960b2c2..344424a9611f 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -266,7 +266,7 @@ 249 common lookup_dcookie sys_lookup_dcookie 250 common epoll_create sys_epoll_create 251 common epoll_ctl sys_epoll_ctl sys_oabi_epoll_ctl -252 common epoll_wait sys_epoll_wait sys_oabi_epoll_wait +252 common epoll_wait sys_epoll_wait 253 common remap_file_pages sys_remap_file_pages # 254 for set_thread_area # 255 for get_thread_area @@ -360,7 +360,7 @@ 343 common vmsplice sys_vmsplice 344 common move_pages sys_move_pages 345 common getcpu sys_getcpu -346 common epoll_pwait sys_epoll_pwait sys_oabi_epoll_pwait +346 common epoll_pwait sys_epoll_pwait 347 common kexec_load sys_kexec_load 348 common utimensat sys_utimensat_time32 349 common signalfd sys_signalfd diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e596e1d0bba..c90c4352325e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1684,8 +1684,8 @@ static int ep_send_events(struct eventpoll *ep, if (!revents) continue; - if (__put_user(revents, &events->events) || - __put_user(epi->event.data, &events->data)) { + events = epoll_put_uevent(revents, epi->event.data, events); + if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) @@ -1693,7 +1693,6 @@ static int ep_send_events(struct eventpoll *ep, break; } res++; - events++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { diff --git 
a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 593322c946e6..3337745d81bd 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -68,4 +68,22 @@ static inline void eventpoll_release(struct file *file) {} #endif +#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) +/* ARM OABI has an incompatible struct layout and needs a special handler */ +extern struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent); +#else +static inline struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent) +{ + if (__put_user(revents, &uevent->events) || + __put_user(data, &uevent->data)) + return NULL; + + return uevent+1; +} +#endif + #endif /* #ifndef _LINUX_EVENTPOLL_H */ -- cgit v1.2.3-71-gd317 From bdec0145286f7e6be9b3134aa35f0f335fa27c38 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:23 +0100 Subject: ARM: 9114/1: oabi-compat: rework sys_semtimedop emulation sys_oabi_semtimedop() is one of the last users of set_fs() on Arm. To remove this one, expose the internal code of the actual implementation that operates on a kernel pointer and call it directly after copying. There should be no measurable impact on the normal execution of this function, and it makes the overly long function a little shorter, which may help readability. While reworking the oabi version, make it behave a little more like the native one, using kvmalloc_array() and restructure the code flow in a similar way. The naming of __do_semtimedop() is not very good, I hope someone can come up with a better name. One regression was spotted by kernel test robot and fixed before the first mailing list submission. 
Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/sys_oabi-compat.c | 60 ++++++++++++++++++++-------- include/linux/syscalls.h | 3 ++ ipc/sem.c | 84 ++++++++++++++++++++++++--------------- 3 files changed, 99 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 1f6a433200f1..5ea365c35ca5 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -302,46 +303,52 @@ struct oabi_sembuf { unsigned short __pad; }; +#define sc_semopm sem_ctls[2] + +#ifdef CONFIG_SYSVIPC asmlinkage long sys_oabi_semtimedop(int semid, struct oabi_sembuf __user *tsops, unsigned nsops, const struct old_timespec32 __user *timeout) { + struct ipc_namespace *ns; struct sembuf *sops; - struct old_timespec32 local_timeout; long err; int i; + ns = current->nsproxy->ipc_ns; + if (nsops > ns->sc_semopm) + return -E2BIG; if (nsops < 1 || nsops > SEMOPM) return -EINVAL; - if (!access_ok(tsops, sizeof(*tsops) * nsops)) - return -EFAULT; - sops = kmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); if (!sops) return -ENOMEM; err = 0; for (i = 0; i < nsops; i++) { struct oabi_sembuf osb; - err |= __copy_from_user(&osb, tsops, sizeof(osb)); + err |= copy_from_user(&osb, tsops, sizeof(osb)); sops[i].sem_num = osb.sem_num; sops[i].sem_op = osb.sem_op; sops[i].sem_flg = osb.sem_flg; tsops++; } - if (timeout) { - /* copy this as well before changing domain protection */ - err |= copy_from_user(&local_timeout, timeout, sizeof(*timeout)); - timeout = &local_timeout; - } if (err) { err = -EFAULT; - } else { - mm_segment_t fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_semtimedop_time32(semid, sops, nsops, timeout); - set_fs(fs); + goto out; + } + + if (timeout) { + struct timespec64 ts; + err = 
get_old_timespec32(&ts, timeout); + if (err) + goto out; + err = __do_semtimedop(semid, sops, nsops, &ts, ns); + goto out; } - kfree(sops); + err = __do_semtimedop(semid, sops, nsops, NULL, ns); +out: + kvfree(sops); return err; } @@ -368,6 +375,27 @@ asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, return sys_ipc(call, first, second, third, ptr, fifth); } } +#else +asmlinkage long sys_oabi_semtimedop(int semid, + struct oabi_sembuf __user *tsops, + unsigned nsops, + const struct old_timespec32 __user *timeout) +{ + return -ENOSYS; +} + +asmlinkage long sys_oabi_semop(int semid, struct oabi_sembuf __user *tsops, + unsigned nsops) +{ + return -ENOSYS; +} + +asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, + void __user *ptr, long fifth) +{ + return -ENOSYS; +} +#endif asmlinkage long sys_oabi_bind(int fd, struct sockaddr __user *addr, int addrlen) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..6c6fc3fd5b72 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1373,6 +1373,9 @@ long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +long __do_semtimedop(int semid, struct sembuf *tsems, unsigned int nsops, + const struct timespec64 *timeout, + struct ipc_namespace *ns); int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); diff --git a/ipc/sem.c b/ipc/sem.c index 971e75d28364..ae8d9104b0a0 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1984,46 +1984,34 @@ out: return un; } -static long do_semtimedop(int semid, struct sembuf __user *tsops, - unsigned nsops, const struct timespec64 *timeout) +long __do_semtimedop(int semid, struct sembuf *sops, + unsigned nsops, const struct timespec64 *timeout, + struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; - struct 
sembuf fast_sops[SEMOPM_FAST]; - struct sembuf *sops = fast_sops, *sop; + struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; unsigned long dup = 0, jiffies_left = 0; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; - if (nsops > SEMOPM_FAST) { - sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); - if (sops == NULL) - return -ENOMEM; - } - - if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { - error = -EFAULT; - goto out_free; - } if (timeout) { if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || timeout->tv_nsec >= 1000000000L) { error = -EINVAL; - goto out_free; + goto out; } jiffies_left = timespec64_to_jiffies(timeout); } + max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); @@ -2052,7 +2040,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); - goto out_free; + goto out; } } else { un = NULL; @@ -2063,25 +2051,25 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); - goto out_free; + goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? 
S_IWUGO : S_IRUGO)) { rcu_read_unlock(); - goto out_free; + goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EIDRM; @@ -2095,7 +2083,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -2104,7 +2092,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) - goto out_unlock_free; + goto out_unlock; queue.sops = sops; queue.nsops = nsops; @@ -2130,10 +2118,10 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, rcu_read_unlock(); wake_up_q(&wake_q); - goto out_free; + goto out; } if (error < 0) /* non-blocking error path */ - goto out_unlock_free; + goto out_unlock; /* * We need to sleep on this operation, so we put the current @@ -2198,14 +2186,14 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (error != -EINTR) { /* see SEM_BARRIER_2 for purpose/pairing */ smp_acquire__after_ctrl_dep(); - goto out_free; + goto out; } rcu_read_lock(); locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; /* * No necessity for any barrier: We are protect by sem_lock() @@ -2217,7 +2205,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * Leave without unlink_queue(), but with sem_unlock(). */ if (error != -EINTR) - goto out_unlock_free; + goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. 
@@ -2228,13 +2216,45 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, unlink_queue(sma, &queue); -out_unlock_free: +out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); +out: + return error; +} + +static long do_semtimedop(int semid, struct sembuf __user *tsops, + unsigned nsops, const struct timespec64 *timeout) +{ + struct sembuf fast_sops[SEMOPM_FAST]; + struct sembuf *sops = fast_sops; + struct ipc_namespace *ns; + int ret; + + ns = current->nsproxy->ipc_ns; + if (nsops > ns->sc_semopm) + return -E2BIG; + if (nsops < 1) + return -EINVAL; + + if (nsops > SEMOPM_FAST) { + sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + if (sops == NULL) + return -ENOMEM; + } + + if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { + ret = -EFAULT; + goto out_free; + } + + ret = __do_semtimedop(semid, sops, nsops, timeout, ns); + out_free: if (sops != fast_sops) kvfree(sops); - return error; + + return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, -- cgit v1.2.3-71-gd317 From 76f3c032adad86aad26f8ad3eebc993b4ba32138 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 20:59:31 +0200 Subject: PCI/VPD: Add pci_vpd_alloc() Several users of the VPD API use a fixed-size buffer and read the VPD into it for further usage. This requires special handling for the case that the buffer isn't big enough to hold the full VPD data. Also the buffer is often allocated on the stack, which isn't too nice. Add pci_vpd_alloc() to dynamically allocate buffer of the correct size and read VPD into it. 
Link: https://lore.kernel.org/r/955ff598-0021-8446-f856-0c2c077635d7@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 26 ++++++++++++++++++++++++++ include/linux/pci.h | 9 +++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 3b0425fb49f5..7c3a097379bb 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -270,6 +270,32 @@ const struct attribute_group pci_dev_vpd_attr_group = { .is_bin_visible = vpd_attr_is_visible, }; +void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size) +{ + unsigned int len = dev->vpd.len; + void *buf; + int cnt; + + if (!dev->vpd.cap) + return ERR_PTR(-ENODEV); + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + cnt = pci_read_vpd(dev, 0, len, buf); + if (cnt != len) { + kfree(buf); + return ERR_PTR(-EIO); + } + + if (size) + *size = len; + + return buf; +} +EXPORT_SYMBOL_GPL(pci_vpd_alloc); + int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) { int i = 0; diff --git a/include/linux/pci.h b/include/linux/pci.h index e752cc39a1fe..8c681e24be8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2330,6 +2330,15 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) return info_field[2]; } +/** + * pci_vpd_alloc - Allocate buffer and read VPD into it + * @dev: PCI device + * @size: pointer to field where VPD length is returned + * + * Returns pointer to allocated buffer or an ERR_PTR in case of failure + */ +void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); + /** * pci_vpd_find_tag - Locates the Resource Data Type tag provided * @buf: Pointer to buffered vpd data -- cgit v1.2.3-71-gd317 From 9e515c9f6c0b6f0ace6f5cf2202b527d745b494d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 21:00:57 +0200 Subject: PCI/VPD: Add pci_vpd_find_ro_info_keyword() All users of pci_vpd_find_info_keyword() are interested in the VPD RO section 
only. In addition all calls are followed by the same activities to calculate start of tag data area and size of the data area. Add pci_vpd_find_ro_info_keyword() that combines these functionalities. pci_vpd_find_info_keyword() can be phased out once all users are converted. [bhelgaas: split pci_vpd_check_csum() to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 33 +++++++++++++++++++++++++++++++++ include/linux/pci.h | 13 +++++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 7c3a097379bb..b1d012900f1e 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -380,6 +380,39 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void } EXPORT_SYMBOL(pci_write_vpd); +int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, + const char *kw, unsigned int *size) +{ + int ro_start, infokw_start; + unsigned int ro_len, infokw_size; + + ro_start = pci_vpd_find_tag(buf, len, PCI_VPD_LRDT_RO_DATA); + if (ro_start < 0) + return ro_start; + + ro_len = pci_vpd_lrdt_size(buf + ro_start); + ro_start += PCI_VPD_LRDT_TAG_SIZE; + + if (ro_start + ro_len > len) + ro_len = len - ro_start; + + infokw_start = pci_vpd_find_info_keyword(buf, ro_start, ro_len, kw); + if (infokw_start < 0) + return infokw_start; + + infokw_size = pci_vpd_info_field_size(buf + infokw_start); + infokw_start += PCI_VPD_INFO_FLD_HDR_SIZE; + + if (infokw_start + infokw_size > len) + return -EINVAL; + + if (size) + *size = infokw_size; + + return infokw_start; +} +EXPORT_SYMBOL_GPL(pci_vpd_find_ro_info_keyword); + #ifdef CONFIG_PCI_QUIRKS /* * Quirk non-zero PCI functions to route VPD access through function 0 for diff --git a/include/linux/pci.h b/include/linux/pci.h index 8c681e24be8b..9e3b60963a52 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2363,6 
+2363,19 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw); +/** + * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section + * @buf: Pointer to buffered VPD data + * @len: The length of the buffer area in which to search + * @kw: The keyword to search for + * @size: Pointer to field where length of found keyword data is returned + * + * Returns the index of the information field keyword data or -ENOENT if + * not found. + */ +int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, + const char *kw, unsigned int *size); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3-71-gd317 From 6107e5cb907cffc5576cc1297847f9fc69a8d5d9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 20 Aug 2021 15:32:42 -0500 Subject: PCI/VPD: Add pci_vpd_check_csum() VPD checksum information and checksum calculation are specified by PCIe r5.0, sec 6.28.2.2. Therefore checksum handling can and should be moved into the PCI VPD core. Add pci_vpd_check_csum() to validate the VPD checksum. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 9 +++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index b1d012900f1e..01e57594781e 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -413,6 +413,29 @@ int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, } EXPORT_SYMBOL_GPL(pci_vpd_find_ro_info_keyword); +int pci_vpd_check_csum(const void *buf, unsigned int len) +{ + const u8 *vpd = buf; + unsigned int size; + u8 csum = 0; + int rv_start; + + rv_start = pci_vpd_find_ro_info_keyword(buf, len, PCI_VPD_RO_KEYWORD_CHKSUM, &size); + if (rv_start == -ENOENT) /* no checksum in VPD */ + return 1; + else if (rv_start < 0) + return rv_start; + + if (!size) + return -EINVAL; + + while (rv_start >= 0) + csum += vpd[rv_start--]; + + return csum ? 
-EILSEQ : 0; +} +EXPORT_SYMBOL_GPL(pci_vpd_check_csum); + #ifdef CONFIG_PCI_QUIRKS /* * Quirk non-zero PCI functions to route VPD access through function 0 for diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e3b60963a52..827b7eefd550 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2376,6 +2376,15 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, const char *kw, unsigned int *size); +/** + * pci_vpd_check_csum - Check VPD checksum + * @buf: Pointer to buffered VPD data + * @len: VPD size + * + * Returns 1 if VPD has no checksum, otherwise 0 or an errno + */ +int pci_vpd_check_csum(const void *buf, unsigned int len); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3-71-gd317 From f0ab00174eb7574732737fc0734d4b406aed6231 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Aug 2021 17:17:28 -0500 Subject: PCI: Make saved capability state private to core Interfaces and structs for saving and restoring PCI Capability state were declared in include/linux/pci.h, but aren't needed outside drivers/pci/. 
Move these to drivers/pci/pci.h: struct pci_cap_saved_data struct pci_cap_saved_state void pci_allocate_cap_save_buffers() void pci_free_cap_save_buffers() int pci_add_cap_save_buffer() int pci_add_ext_cap_save_buffer() struct pci_cap_saved_state *pci_find_saved_cap() struct pci_cap_saved_state *pci_find_saved_ext_cap() Link: https://lore.kernel.org/r/20210802221728.1469304-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- drivers/pci/pci.h | 23 +++++++++++++++++++++-- include/linux/pci.h | 18 ------------------ 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 93dcdd431072..288126062a38 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -37,6 +37,27 @@ int pci_probe_reset_function(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); +struct pci_cap_saved_data { + u16 cap_nr; + bool cap_extended; + unsigned int size; + u32 data[]; +}; + +struct pci_cap_saved_state { + struct hlist_node next; + struct pci_cap_saved_data cap; +}; + +void pci_allocate_cap_save_buffers(struct pci_dev *dev); +void pci_free_cap_save_buffers(struct pci_dev *dev); +int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); +int pci_add_ext_cap_save_buffer(struct pci_dev *dev, + u16 cap, unsigned int size); +struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); +struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, + u16 cap); + #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ @@ -100,8 +121,6 @@ void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); -void pci_allocate_cap_save_buffers(struct pci_dev *dev); -void 
pci_free_cap_save_buffers(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..fd35327812af 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -288,18 +288,6 @@ enum pci_bus_speed { enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); -struct pci_cap_saved_data { - u16 cap_nr; - bool cap_extended; - unsigned int size; - u32 data[]; -}; - -struct pci_cap_saved_state { - struct hlist_node next; - struct pci_cap_saved_data cap; -}; - struct irq_affinity; struct pcie_link_state; struct pci_vpd; @@ -1278,12 +1266,6 @@ int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state); int pci_load_and_free_saved_state(struct pci_dev *dev, struct pci_saved_state **state); -struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); -struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, - u16 cap); -int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); -int pci_add_ext_cap_save_buffer(struct pci_dev *dev, - u16 cap, unsigned int size); int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); -- cgit v1.2.3-71-gd317 From ca32b5310a1a3835f81f498367f1bb7450c8b67b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 13 Jul 2021 15:22:36 +0800 Subject: PCI: Optimize pci_resource_len() to reduce kernel size pci_resource_end() can be 0 only when pci_resource_start() is 0. Otherwise, it is definitely an error. In this case, pci_resource_len() should be regarded as 0. 
Therefore, determining whether pci_resource_start() and pci_resource_end() are both 0 can be reduced to determining only whether pci_resource_end() is 0. Although only one condition judgment is reduced, the macro function pci_resource_len() is widely referenced in the kernel. I used defconfig to compile the latest kernel on X86, and its binary code size was reduced by about 3KB. Before: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a67168 0000000000000018 I 68 1 8 After: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a66598 0000000000000018 I 68 1 8 Link: https://lore.kernel.org/r/20210713072236.3043-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..23ef1a15eb5d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1881,9 +1881,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); #define pci_resource_end(dev, bar) ((dev)->resource[(bar)].end) #define pci_resource_flags(dev, bar) ((dev)->resource[(bar)].flags) #define pci_resource_len(dev,bar) \ - ((pci_resource_start((dev), (bar)) == 0 && \ - pci_resource_end((dev), (bar)) == \ - pci_resource_start((dev), (bar))) ? 0 : \ + ((pci_resource_end((dev), (bar)) == 0) ? 0 : \ \ (pci_resource_end((dev), (bar)) - \ pci_resource_start((dev), (bar)) + 1)) -- cgit v1.2.3-71-gd317 From 817f9916a6e96ae43acdd4e75459ef4f92d96eb1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Aug 2021 18:36:19 +0300 Subject: PCI: Sync __pci_register_driver() stub for CONFIG_PCI=n The CONFIG_PCI=y case got a new parameter long time ago. Sync the stub as well. 
[bhelgaas: add parameter names] Fixes: 725522b5453d ("PCI: add the sysfs driver name to all modules") Link: https://lore.kernel.org/r/20210813153619.89574-1-andriy.shevchenko@linux.intel.com Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index fd35327812af..a662f6c1f120 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1722,8 +1722,9 @@ static inline void pci_disable_device(struct pci_dev *dev) { } static inline int pcim_enable_device(struct pci_dev *pdev) { return -EIO; } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } -static inline int __pci_register_driver(struct pci_driver *drv, - struct module *owner) +static inline int __must_check __pci_register_driver(struct pci_driver *drv, + struct module *owner, + const char *mod_name) { return 0; } static inline int pci_register_driver(struct pci_driver *drv) { return 0; } -- cgit v1.2.3-71-gd317 From 6e7c1770a212239e88ec01ddc7a741505bfd10e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jul 2021 16:23:21 -0400 Subject: fs: simplify get_filesystem_list / get_all_fs_names Just output the '\0'-separated list of supported file systems for block devices directly rather than going through a pointless round of string manipulation. Based on an earlier patch from Al Viro. Vivek: Modified list_bdev_fs_names() and split_fs_names() to return the number of null-terminated strings to the caller. Callers now use that information to loop through all the strings instead of relying on one extra null char being present at the end.
Signed-off-by: Christoph Hellwig Signed-off-by: Vivek Goyal Signed-off-by: Al Viro --- fs/filesystems.c | 27 +++++++++++++++++---------- include/linux/fs.h | 2 +- init/do_mounts.c | 49 +++++++++++++++++++++---------------------------- 3 files changed, 39 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/filesystems.c b/fs/filesystems.c index 90b8d879fbaf..58b9067b2391 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -209,21 +209,28 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } #endif -int __init get_filesystem_list(char *buf) +int __init list_bdev_fs_names(char *buf, size_t size) { - int len = 0; - struct file_system_type * tmp; + struct file_system_type *p; + size_t len; + int count = 0; read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); - tmp = tmp->next; + for (p = file_systems; p; p = p->next) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + continue; + len = strlen(p->name) + 1; + if (len > size) { + pr_warn("%s: truncating file system list\n", __func__); + break; + } + memcpy(buf, p->name, len); + buf += len; + size -= len; + count++; } read_unlock(&file_systems_lock); - return len; + return count; } #ifdef CONFIG_PROC_FS diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..c76dfc01cf9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3622,7 +3622,7 @@ int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int __init get_filesystem_list(char *buf); +int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) diff --git a/init/do_mounts.c b/init/do_mounts.c index 
bdeb90b8d669..9b4a1f877e47 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -338,32 +338,22 @@ __setup("rootflags=", root_data_setup); __setup("rootfstype=", fs_names_setup); __setup("rootdelay=", root_delay_setup); -static void __init split_fs_names(char *page, char *names) +static int __init split_fs_names(char *page, char *names) { - strcpy(page, root_fs_names); - while (*page++) { - if (page[-1] == ',') - page[-1] = '\0'; - } - *page = '\0'; -} - -static void __init get_all_fs_names(char *page) -{ - int len = get_filesystem_list(page); - char *s = page, *p, *next; + int count = 0; + char *p = page; - page[len] = '\0'; - for (p = page - 1; p; p = next) { - next = strchr(++p, '\n'); - if (*p++ != '\t') - continue; - while ((*s++ = *p++) != '\n') - ; - s[-1] = '\0'; + strcpy(p, root_fs_names); + while (*p++) { + if (p[-1] == ',') + p[-1] = '\0'; } + *p = '\0'; + + for (p = page; *p; p += strlen(p)+1) + count++; - *s = '\0'; + return count; } static int __init do_mount_root(const char *name, const char *fs, @@ -409,15 +399,16 @@ void __init mount_block_root(char *name, int flags) char *fs_names = page_address(page); char *p; char b[BDEVNAME_SIZE]; + int num_fs, i; scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); if (root_fs_names) - split_fs_names(fs_names, root_fs_names); + num_fs = split_fs_names(fs_names, root_fs_names); else - get_all_fs_names(fs_names); + num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE); retry: - for (p = fs_names; *p; p += strlen(p)+1) { + for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) { int err = do_mount_root(name, p, flags, root_mount_data); switch (err) { case 0: @@ -450,7 +441,7 @@ retry: printk("List of all partitions:\n"); printk_all_partitions(); printk("No filesystem could mount root, tried: "); - for (p = fs_names; *p; p += strlen(p)+1) + for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) printk(" %s", p); printk("\n"); panic("VFS: Unable to mount root fs on %s", 
b); @@ -551,13 +542,15 @@ static int __init mount_nodev_root(void) { char *fs_names, *fstype; int err = -EINVAL; + int num_fs, i; fs_names = (void *)__get_free_page(GFP_KERNEL); if (!fs_names) return -EINVAL; - split_fs_names(fs_names, root_fs_names); + num_fs = split_fs_names(fs_names, root_fs_names); - for (fstype = fs_names; *fstype; fstype += strlen(fstype) + 1) { + for (i = 0, fstype = fs_names; i < num_fs; + i++, fstype += strlen(fstype) + 1) { if (!fs_is_nodev(fstype)) continue; err = do_mount_root(root_device_name, fstype, root_mountflags, -- cgit v1.2.3-71-gd317 From 15d82ca23c996d50062286d27ed6a42a8105c04a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 27 Jul 2021 02:06:50 +0800 Subject: PCI: Introduce domain_nr in pci_host_bridge Currently we retrieve the PCI domain number of the host bridge from the bus sysdata (or pci_config_window if PCI_DOMAINS_GENERIC=y). Actually we have the information at PCI host bridge probing time, and it makes sense that we store it into pci_host_bridge. One benefit of doing so is the requirement for supporting PCI on Hyper-V for ARM64, because the host bridge of Hyper-V doesn't have pci_config_window, whereas ARM64 is a PCI_DOMAINS_GENERIC=y arch, so we cannot retrieve the PCI domain number from pci_config_window on ARM64 Hyper-V guest. As the preparation for ARM64 Hyper-V PCI support, we introduce the domain_nr in pci_host_bridge and a sentinel value to allow drivers to set domain numbers properly at probing time. Currently CONFIG_PCI_DOMAINS_GENERIC=y archs are only users of this newly-introduced field. 
Link: https://lore.kernel.org/r/20210726180657.142727-2-boqun.feng@gmail.com Signed-off-by: Boqun Feng Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/probe.c | 6 +++++- include/linux/pci.h | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 79177ac37880..60c50d4f156f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -594,6 +594,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_pme = 1; bridge->native_ltr = 1; bridge->native_dpc = 1; + bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET; device_initialize(&bridge->dev); } @@ -898,7 +899,10 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) bus->ops = bridge->ops; bus->number = bus->busn_res.start = bridge->busnr; #ifdef CONFIG_PCI_DOMAINS_GENERIC - bus->domain_nr = pci_bus_find_domain_nr(bus, parent); + if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET) + bus->domain_nr = pci_bus_find_domain_nr(bus, parent); + else + bus->domain_nr = bridge->domain_nr; #endif b = pci_find_bus(pci_domain_nr(bus), bridge->busnr); diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..01aa201e1df0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -526,6 +526,16 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } +/* + * Currently in ACPI spec, for each PCI host bridge, PCI Segment + * Group number is limited to a 16-bit value, therefore (int)-1 is + * not a valid PCI domain number, and can be used as a sentinel + * value indicating ->domain_nr is not set by the driver (and + * CONFIG_PCI_DOMAINS_GENERIC=y archs will set it with + * pci_bus_find_domain_nr()). 
+ */ +#define PCI_DOMAIN_NR_NOT_SET (-1) + struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* Root bus */ @@ -533,6 +543,7 @@ struct pci_host_bridge { struct pci_ops *child_ops; void *sysdata; int busnr; + int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ -- cgit v1.2.3-71-gd317 From 8c09e896cef8d908dd9a20a9f2a5c3fcb9799de3 Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Tue, 13 Jul 2021 10:54:34 +0800 Subject: PCI: Allow PASID on fake PCIe devices without TLP prefixes Some systems, e.g., HiSilicon KunPeng920 and KunPeng930, have devices that appear as PCI but are actually on the AMBA bus. Some of these fake PCI devices support a PASID-like feature and they do have a working PASID capability even though they do not use the PCIe Transport Layer Protocol and do not support TLP prefixes. Add a pasid_no_tlp bit for this "PASID works without TLP prefixes" case and update pci_enable_pasid() so it can enable PASID on these devices. Set this bit for HiSilicon KunPeng920 and KunPeng930. 
[bhelgaas: squashed, commit log] Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/1626144876-11352-2-git-send-email-zhangfei.gao@linaro.org Link: https://lore.kernel.org/r/1626144876-11352-3-git-send-email-zhangfei.gao@linaro.org Signed-off-by: Zhangfei Gao Signed-off-by: Jean-Philippe Brucker Signed-off-by: Zhou Wang Signed-off-by: Bjorn Helgaas --- drivers/pci/ats.c | 2 +- drivers/pci/quirks.c | 14 ++++++++++++++ include/linux/pci.h | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 6d7d64939f82..c967ad6e2626 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -376,7 +376,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (WARN_ON(pdev->pasid_enabled)) return -EBUSY; - if (!pdev->eetlp_prefix_path) + if (!pdev->eetlp_prefix_path && !pdev->pasid_no_tlp) return -EINVAL; if (!pasid) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6d74386eadc2..5d46ac697218 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1821,6 +1821,20 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quir DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_HUAWEI, 0x1610, PCI_CLASS_BRIDGE_PCI, 8, quirk_pcie_mch); +static void quirk_huawei_pcie_sva(struct pci_dev *pdev) +{ + if (pdev->revision != 0x21 && pdev->revision != 0x30) + return; + + pdev->pasid_no_tlp = 1; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa250, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa251, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa255, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa256, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa258, quirk_huawei_pcie_sva); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa259, quirk_huawei_pcie_sva); + /* * It's possible for the MSI to get corrupted if SHPC and ACPI are used * together on certain 
PXH-based systems. diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..28165dc5b221 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -388,6 +388,7 @@ struct pci_dev { supported from root to here */ u16 l1ss; /* L1SS Capability pointer */ #endif + unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ pci_channel_state_t error_state; /* Current connectivity state */ -- cgit v1.2.3-71-gd317 From 1b7646014e0d838b06be7288e2dec3262948cc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:05 +0200 Subject: dax: mark dax_get_by_host static And move the code around a bit to avoid a forward declaration. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-5-hch@lst.de Signed-off-by: Dan Williams --- drivers/dax/super.c | 109 ++++++++++++++++++++++++++-------------------------- include/linux/dax.h | 5 --- 2 files changed, 54 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 3e6d7e9ee34f..e13fde57c33e 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -17,6 +17,24 @@ #include #include "dax-private.h" +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @host: optional name for lookups where the device path is not available + * @private: dax driver private data + * @flags: state and boolean properties + */ +struct dax_device { + struct hlist_node list; + struct inode inode; + struct cdev cdev; + const char *host; + void *private; + unsigned long flags; + const struct dax_operations *ops; +}; + static dev_t dax_devt; DEFINE_STATIC_SRCU(dax_srcu); static struct vfsmount *dax_mnt; @@ -40,6 +58,42 @@ void dax_read_unlock(int id) } EXPORT_SYMBOL_GPL(dax_read_unlock); +static int dax_host_hash(const char *host) +{ + return 
hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; +} + +/** + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax + * @host: alternate name for the device registered by a dax driver + */ +static struct dax_device *dax_get_by_host(const char *host) +{ + struct dax_device *dax_dev, *found = NULL; + int hash, id; + + if (!host) + return NULL; + + hash = dax_host_hash(host); + + id = dax_read_lock(); + spin_lock(&dax_host_lock); + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { + if (!dax_alive(dax_dev) + || strcmp(host, dax_dev->host) != 0) + continue; + + if (igrab(&dax_dev->inode)) + found = dax_dev; + break; + } + spin_unlock(&dax_host_lock); + dax_read_unlock(id); + + return found; +} + #ifdef CONFIG_BLOCK #include @@ -202,24 +256,6 @@ enum dax_device_flags { DAXDEV_SYNC, }; -/** - * struct dax_device - anchor object for dax services - * @inode: core vfs - * @cdev: optional character interface for "device dax" - * @host: optional name for lookups where the device path is not available - * @private: dax driver private data - * @flags: state and boolean properties - */ -struct dax_device { - struct hlist_node list; - struct inode inode; - struct cdev cdev; - const char *host; - void *private; - unsigned long flags; - const struct dax_operations *ops; -}; - static ssize_t write_cache_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -417,11 +453,6 @@ bool dax_alive(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_alive); -static int dax_host_hash(const char *host) -{ - return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; -} - /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen @@ -618,38 +649,6 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); -/** - * dax_get_by_host() - temporary lookup mechanism for filesystem-dax - * @host: alternate name for the device registered by a dax driver - */ 
-struct dax_device *dax_get_by_host(const char *host) -{ - struct dax_device *dax_dev, *found = NULL; - int hash, id; - - if (!host) - return NULL; - - hash = dax_host_hash(host); - - id = dax_read_lock(); - spin_lock(&dax_host_lock); - hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { - if (!dax_alive(dax_dev) - || strcmp(host, dax_dev->host) != 0) - continue; - - if (igrab(&dax_dev->inode)) - found = dax_dev; - break; - } - spin_unlock(&dax_host_lock); - dax_read_unlock(id); - - return found; -} -EXPORT_SYMBOL_GPL(dax_get_by_host); - /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/include/linux/dax.h b/include/linux/dax.h index b52f084aa643..379739b55408 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,7 +41,6 @@ struct dax_operations { extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags); void put_dax(struct dax_device *dax_dev); @@ -73,10 +72,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else -static inline struct dax_device *dax_get_by_host(const char *host) -{ - return NULL; -} static inline struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags) { -- cgit v1.2.3-71-gd317 From cd93a2a4d1b076f5c73d70d836c202bbcbeea49e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:07 +0200 Subject: dax: remove __generic_fsdax_supported Just implement generic_fsdax_supported directly out of line instead of adding a wrapper. Given that generic_fsdax_supported is only supplied for CONFIG_FS_DAX builds this also allows to not provide it at all for !CONFIG_FS_DAX builds. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-7-hch@lst.de Signed-off-by: Dan Williams --- drivers/dax/super.c | 8 ++++---- include/linux/dax.h | 16 ++-------------- 2 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0f74f83101ab..8e8ccb3e956b 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -119,9 +119,8 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) return dax_get_by_host(bdev->bd_disk->disk_name); } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); -#endif -bool __generic_fsdax_supported(struct dax_device *dax_dev, +bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors) { @@ -201,7 +200,8 @@ bool __generic_fsdax_supported(struct dax_device *dax_dev, } return true; } -EXPORT_SYMBOL_GPL(__generic_fsdax_supported); +EXPORT_SYMBOL_GPL(generic_fsdax_supported); +#endif /* CONFIG_FS_DAX */ /** * __bdev_dax_supported() - Check if the device supports dax for filesystem @@ -360,7 +360,7 @@ bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, return false; id = dax_read_lock(); - if (dax_alive(dax_dev)) + if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len); dax_read_unlock(id); diff --git a/include/linux/dax.h b/include/linux/dax.h index 379739b55408..0a3ef9701e03 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -123,16 +123,9 @@ static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) return __bdev_dax_supported(bdev, blocksize); } -bool __generic_fsdax_supported(struct dax_device *dax_dev, +bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); -static inline bool generic_fsdax_supported(struct dax_device 
*dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, - sectors); -} static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -154,12 +147,7 @@ static inline bool bdev_dax_supported(struct block_device *bdev, return false; } -static inline bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return false; -} +#define generic_fsdax_supported NULL static inline void fs_put_dax(struct dax_device *dax_dev) { -- cgit v1.2.3-71-gd317 From 60b8340f0d6587d7b51990689fcdae567f309fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:08 +0200 Subject: dax: stub out dax_supported for !CONFIG_FS_DAX dax_supported calls into ->dax_supported which checks for fsdax support. Don't bother building it for !CONFIG_FS_DAX as it will always return false. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-8-hch@lst.de Signed-off-by: Dan Williams --- drivers/dax/super.c | 36 ++++++++++++++++++------------------ include/linux/dax.h | 18 ++++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 8e8ccb3e956b..eed02729add3 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -201,6 +201,24 @@ bool generic_fsdax_supported(struct dax_device *dax_dev, return true; } EXPORT_SYMBOL_GPL(generic_fsdax_supported); + +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len) +{ + bool ret = false; + int id; + + if (!dax_dev) + return false; + + id = dax_read_lock(); + if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) + ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, + start, len); + dax_read_unlock(id); + return ret; +} 
+EXPORT_SYMBOL_GPL(dax_supported); #endif /* CONFIG_FS_DAX */ /** @@ -350,24 +368,6 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, } EXPORT_SYMBOL_GPL(dax_direct_access); -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len) -{ - bool ret = false; - int id; - - if (!dax_dev) - return false; - - id = dax_read_lock(); - if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) - ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, - start, len); - dax_read_unlock(id); - return ret; -} -EXPORT_SYMBOL_GPL(dax_supported); - size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 0a3ef9701e03..32dce5763f2c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -57,8 +57,6 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev) { __set_dax_synchronous(dax_dev); } -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); /* * Check if given mapping is supported by the file / underlying device. 
*/ @@ -101,12 +99,6 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { @@ -127,6 +119,9 @@ bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); + static inline void fs_put_dax(struct dax_device *dax_dev) { put_dax(dax_dev); @@ -149,6 +144,13 @@ static inline bool bdev_dax_supported(struct block_device *bdev, #define generic_fsdax_supported NULL +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} + static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3-71-gd317 From bdd3c50d83bf7f6acc869b48d02670d19030ae03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:10 +0200 Subject: dax: remove bdev_dax_supported All callers already have a dax_device obtained from fs_dax_get_by_bdev at hand, so just pass that to dax_supported() instead of doing another lookup.
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-10-hch@lst.de Signed-off-by: Dan Williams --- drivers/dax/super.c | 42 +----------------------------------------- fs/ext2/super.c | 3 ++- fs/ext4/super.c | 3 ++- fs/xfs/xfs_super.c | 3 ++- include/linux/dax.h | 12 ------------ 5 files changed, 7 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index eed02729add3..fc89e91beea7 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -220,47 +220,7 @@ bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, } EXPORT_SYMBOL_GPL(dax_supported); #endif /* CONFIG_FS_DAX */ - -/** - * __bdev_dax_supported() - Check if the device supports dax for filesystem - * @bdev: block device to check - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: true if supported, false if unsupported - */ -bool __bdev_dax_supported(struct block_device *bdev, int blocksize) -{ - struct dax_device *dax_dev; - struct request_queue *q; - char buf[BDEVNAME_SIZE]; - bool ret; - - q = bdev_get_queue(bdev); - if (!q || !blk_queue_dax(q)) { - pr_debug("%s: error: request queue doesn't support dax\n", - bdevname(bdev, buf)); - return false; - } - - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - pr_debug("%s: error: device does not support dax\n", - bdevname(bdev, buf)); - return false; - } - - ret = dax_supported(dax_dev, bdev, blocksize, 0, - i_size_read(bdev->bd_inode) / 512); - - put_dax(dax_dev); - - return ret; -} -EXPORT_SYMBOL_GPL(__bdev_dax_supported); -#endif +#endif /* CONFIG_BLOCK */ enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 21e09fbaa46f..26e69e48d7e0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c 
@@ -949,7 +949,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); clear_opt(sbi->s_mount_opt, DAX); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..a1726a8debce 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4435,7 +4435,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (bdev_dax_supported(sb->s_bdev, blocksize)) + if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5a89bf601d97..f4384974e52a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -319,7 +319,8 @@ xfs_buftarg_is_dax( struct super_block *sb, struct xfs_buftarg *bt) { - return bdev_dax_supported(bt->bt_bdev, sb->s_blocksize); + return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0, + bdev_nr_sectors(bt->bt_bdev)); } STATIC int diff --git a/include/linux/dax.h b/include/linux/dax.h index 32dce5763f2c..2619d94c308d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -109,12 +109,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) -bool __bdev_dax_supported(struct block_device *bdev, int blocksize); -static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) -{ - return __bdev_dax_supported(bdev, blocksize); -} - bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, 
int blocksize, sector_t start, sector_t sectors); @@ -136,12 +130,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); #else -static inline bool bdev_dax_supported(struct block_device *bdev, - int blocksize) -{ - return false; -} - #define generic_fsdax_supported NULL static inline bool dax_supported(struct dax_device *dax_dev, -- cgit v1.2.3-71-gd317 From ab959c7d4ea086852f35c7ff20ecd79b7471cfad Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 6 Aug 2021 10:53:21 +0100 Subject: dmaengine: Extend the dma_slave_width for 128 bytes Add DMA_SLAVE_BUSWIDTH_128_BYTES to dma_slave_width for DMA engines and users to select 128 bytes as bus width. Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20210806095322.2326-3-biju.das.jz@bp.renesas.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 93c3ca5fdafd..e5c2c9e71bf1 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -380,6 +380,7 @@ enum dma_slave_buswidth { DMA_SLAVE_BUSWIDTH_16_BYTES = 16, DMA_SLAVE_BUSWIDTH_32_BYTES = 32, DMA_SLAVE_BUSWIDTH_64_BYTES = 64, + DMA_SLAVE_BUSWIDTH_128_BYTES = 128, }; /** @@ -398,7 +399,7 @@ enum dma_slave_buswidth { * @src_addr_width: this is the width in bytes of the source (RX) * register where DMA data shall be read. If the source * is memory this may be ignored depending on architecture. - * Legal values: 1, 2, 3, 4, 8, 16, 32, 64. + * Legal values: 1, 2, 3, 4, 8, 16, 32, 64, 128. * @dst_addr_width: same as src_addr_width but for destination * target (TX) mutatis mutandis. 
* @src_maxburst: the maximum number of words (note: words, as in -- cgit v1.2.3-71-gd317 From a61590892ef097c180144fa469abe2256b9ae715 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:53:42 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_tag() Now that the last users have been migrated to pci_vpd_find_ro_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/71131eca-0502-7878-365f-30b6614161cf@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 3 +-- include/linux/pci.h | 11 ----------- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 01e57594781e..5726fbb7a03f 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -296,7 +296,7 @@ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size) } EXPORT_SYMBOL_GPL(pci_vpd_alloc); -int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) +static int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) { int i = 0; @@ -310,7 +310,6 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) return -ENOENT; } -EXPORT_SYMBOL_GPL(pci_vpd_find_tag); int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw) diff --git a/include/linux/pci.h b/include/linux/pci.h index 827b7eefd550..4fb233e374c5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,17 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_tag - Locates the Resource Data Type tag provided - * @buf: Pointer to buffered vpd data - * @len: The length of the vpd buffer - * @rdt: The Resource Data Type to search for - * - * Returns the index where the Resource Data Type was found or - * -ENOENT otherwise. 
- */ -int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); - /** * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD * @buf: Pointer to buffered vpd data -- cgit v1.2.3-71-gd317 From 59b83b29bb5532bbff54a271e0b4f321e28b954f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:54:23 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_info_keyword() Now that the last users have been migrated to pci_vpd_find_ro_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/96ca2a56-383e-9b61-9cba-4f1e5611dc15@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 3 +-- include/linux/pci.h | 13 ------------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 5726fbb7a03f..0e7a5e8a8f17 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -311,7 +311,7 @@ static int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) return -ENOENT; } -int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, +static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw) { int i; @@ -327,7 +327,6 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, return -ENOENT; } -EXPORT_SYMBOL_GPL(pci_vpd_find_info_keyword); /** * pci_read_vpd - Read one entry from Vital Product Data diff --git a/include/linux/pci.h b/include/linux/pci.h index 4fb233e374c5..196cbf4c76a1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,19 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD - * @buf: Pointer to buffered vpd data - * @off: The offset into the buffer at which to begin the search - * @len: The length of the buffer area, 
relative to off, in which to search - * @kw: The keyword to search for - * - * Returns the index where the information field keyword was found or - * -ENOENT otherwise. - */ -int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, - unsigned int len, const char *kw); - /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3-71-gd317 From acfbb1b8a494d7bfd316dfb363a820e6df637e8d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:55:43 +0200 Subject: PCI/VPD: Add pci_vpd_find_id_string() Add a pci_vpd_find_id_string() API function to retrieve the ID string from VPD. This way callers don't need pci_vpd_lrdt_size() any longer, and it can be made private to the VPD core. Link: https://lore.kernel.org/r/c5225bf6-8d29-970d-e271-0d7b52252630@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 6 ++++++ include/linux/pci.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index b7bf014ccc5f..79712b3d17b6 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -320,6 +320,12 @@ static int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt, unsigned in return -ENOENT; } +int pci_vpd_find_id_string(const u8 *buf, unsigned int len, unsigned int *size) +{ + return pci_vpd_find_tag(buf, len, PCI_VPD_LRDT_ID_STRING, size); +} +EXPORT_SYMBOL_GPL(pci_vpd_find_id_string); + static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 196cbf4c76a1..ea330ca0501a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,6 +2339,16 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); +/** + * pci_vpd_find_id_string - Locate id string in VPD + * @buf: Pointer to 
buffered VPD data + * @len: The length of the buffer area in which to search + * @size: Pointer to field where length of id string is returned + * + * Returns the index of the id string or -ENOENT if not found. + */ +int pci_vpd_find_id_string(const u8 *buf, unsigned int len, unsigned int *size); + /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3-71-gd317 From 06e1913d457121a98ee276179734c34dab30f388 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:57:01 +0200 Subject: PCI/VPD: Clean up public VPD defines and inline functions After recent introduction of new VPD API functions and user migration these defines and inline functions aren't used outside VPD core any longer. Link: https://lore.kernel.org/r/d33e06bf-bc5e-ece7-bf35-7245ae224d1b@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 26 ++++++++++++++++++++ include/linux/pci.h | 69 ----------------------------------------------------- 2 files changed, 26 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 79712b3d17b6..ff600dff4557 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -11,6 +11,32 @@ #include #include "pci.h" +#define PCI_VPD_LRDT_TAG_SIZE 3 +#define PCI_VPD_SRDT_LEN_MASK 0x07 +#define PCI_VPD_SRDT_TAG_SIZE 1 +#define PCI_VPD_STIN_END 0x0f +#define PCI_VPD_INFO_FLD_HDR_SIZE 3 + +static u16 pci_vpd_lrdt_size(const u8 *lrdt) +{ + return (u16)lrdt[1] + ((u16)lrdt[2] << 8); +} + +static u8 pci_vpd_srdt_tag(const u8 *srdt) +{ + return *srdt >> 3; +} + +static u8 pci_vpd_srdt_size(const u8 *srdt) +{ + return *srdt & PCI_VPD_SRDT_LEN_MASK; +} + +static u8 pci_vpd_info_field_size(const u8 *info_field) +{ + return info_field[2]; +} + /* VPD access through PCI 2.2+ VPD capability */ static struct pci_dev *pci_get_func0_dev(struct pci_dev *dev) diff --git a/include/linux/pci.h 
b/include/linux/pci.h index ea330ca0501a..303034d03c33 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2255,81 +2255,12 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask); #define PCI_VPD_LRDT_RO_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RO_DATA) #define PCI_VPD_LRDT_RW_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RW_DATA) -/* Small Resource Data Type Tag Item Names */ -#define PCI_VPD_STIN_END 0x0f /* End */ - -#define PCI_VPD_SRDT_END (PCI_VPD_STIN_END << 3) - -#define PCI_VPD_SRDT_TIN_MASK 0x78 -#define PCI_VPD_SRDT_LEN_MASK 0x07 -#define PCI_VPD_LRDT_TIN_MASK 0x7f - -#define PCI_VPD_LRDT_TAG_SIZE 3 -#define PCI_VPD_SRDT_TAG_SIZE 1 - -#define PCI_VPD_INFO_FLD_HDR_SIZE 3 - #define PCI_VPD_RO_KEYWORD_PARTNO "PN" #define PCI_VPD_RO_KEYWORD_SERIALNO "SN" #define PCI_VPD_RO_KEYWORD_MFR_ID "MN" #define PCI_VPD_RO_KEYWORD_VENDOR0 "V0" #define PCI_VPD_RO_KEYWORD_CHKSUM "RV" -/** - * pci_vpd_lrdt_size - Extracts the Large Resource Data Type length - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type length. - */ -static inline u16 pci_vpd_lrdt_size(const u8 *lrdt) -{ - return (u16)lrdt[1] + ((u16)lrdt[2] << 8); -} - -/** - * pci_vpd_lrdt_tag - Extracts the Large Resource Data Type Tag Item - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type Tag item. - */ -static inline u16 pci_vpd_lrdt_tag(const u8 *lrdt) -{ - return (u16)(lrdt[0] & PCI_VPD_LRDT_TIN_MASK); -} - -/** - * pci_vpd_srdt_size - Extracts the Small Resource Data Type length - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type length. 
- */ -static inline u8 pci_vpd_srdt_size(const u8 *srdt) -{ - return (*srdt) & PCI_VPD_SRDT_LEN_MASK; -} - -/** - * pci_vpd_srdt_tag - Extracts the Small Resource Data Type Tag Item - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type Tag Item. - */ -static inline u8 pci_vpd_srdt_tag(const u8 *srdt) -{ - return ((*srdt) & PCI_VPD_SRDT_TIN_MASK) >> 3; -} - -/** - * pci_vpd_info_field_size - Extracts the information field length - * @info_field: Pointer to the beginning of an information field header - * - * Returns the extracted information field length. - */ -static inline u8 pci_vpd_info_field_size(const u8 *info_field) -{ - return info_field[2]; -} - /** * pci_vpd_alloc - Allocate buffer and read VPD into it * @dev: PCI device -- cgit v1.2.3-71-gd317 From 4b92d4add5f6dcf21275185c997d6ecb800054cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Aug 2021 13:48:34 +0200 Subject: drivers: base: cacheinfo: Get rid of DEFINE_SMP_CALL_CACHE_FUNCTION() DEFINE_SMP_CALL_CACHE_FUNCTION() was useful before the CPU hotplug rework to ensure that the cache related functions are called on the upcoming CPU because the notifier itself could run on any online CPU. The hotplug state machine guarantees that the callbacks are invoked on the upcoming CPU. So there is no need to have this SMP function call obfuscation. That indirection was missed when the hotplug notifiers were converted. This also solves the problem of ARM64 init_cache_level() invoking ACPI functions which take a semaphore in that context. That's invalid as SMP function calls run with interrupts disabled. Running it just from the callback in context of the CPU hotplug thread solves this. 
Fixes: 8571890e1513 ("arm64: Add support for ACPI based firmware tables") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Acked-by: Will Deacon Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/871r69ersb.ffs@tglx --- arch/arm64/kernel/cacheinfo.c | 7 ++----- arch/mips/kernel/cacheinfo.c | 7 ++----- arch/riscv/kernel/cacheinfo.c | 7 ++----- arch/x86/kernel/cpu/cacheinfo.c | 7 ++----- include/linux/cacheinfo.h | 18 ------------------ 5 files changed, 8 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index 7fa6828bb488..587543c6c51c 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -43,7 +43,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->type = type; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { unsigned int ctype, level, leaves, fw_level; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -78,7 +78,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int level, idx; enum cache_type type; @@ -97,6 +97,3 @@ static int __populate_cache_leaves(unsigned int cpu) } return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/mips/kernel/cacheinfo.c b/arch/mips/kernel/cacheinfo.c index 53d8ea7d36e6..495dd058231d 100644 --- a/arch/mips/kernel/cacheinfo.c +++ b/arch/mips/kernel/cacheinfo.c @@ -17,7 +17,7 @@ do { \ leaf++; \ } while (0) -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -74,7 +74,7 @@ static void fill_cpumask_cluster(int cpu, cpumask_t *cpu_map) cpumask_set_cpu(cpu1, cpu_map); } -static 
int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpuinfo_mips *c = ¤t_cpu_data; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -114,6 +114,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index d86781357044..90deabfe63ea 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -113,7 +113,7 @@ static void fill_cacheinfo(struct cacheinfo **this_leaf, } } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct device_node *np = of_cpu_device_node_get(cpu); @@ -155,7 +155,7 @@ static int __init_cache_level(unsigned int cpu) return 0; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf = this_cpu_ci->info_list; @@ -187,6 +187,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index d66af2950e06..b5e36bd0425b 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -985,7 +985,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->priv = base->nb; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1014,7 +1014,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) id4_regs->id = c->apicid >> index_msb; } -static int __populate_cache_leaves(unsigned int cpu) +int 
populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1033,6 +1033,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 4f72b47973c3..2f909ed084c6 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -79,24 +79,6 @@ struct cpu_cacheinfo { bool cpu_map_populated; }; -/* - * Helpers to make sure "func" is executed on the cpu whose cache - * attributes are being detected - */ -#define DEFINE_SMP_CALL_CACHE_FUNCTION(func) \ -static inline void _##func(void *ret) \ -{ \ - int cpu = smp_processor_id(); \ - *(int *)ret = __##func(cpu); \ -} \ - \ -int func(unsigned int cpu) \ -{ \ - int ret; \ - smp_call_function_single(cpu, _##func, &ret, true); \ - return ret; \ -} - struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu); int init_cache_level(unsigned int cpu); int populate_cache_leaves(unsigned int cpu); -- cgit v1.2.3-71-gd317 From 59dc33252ee777e02332774fbdf3381b1d5d5f5d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 Aug 2021 16:43:55 +0200 Subject: PCI: VMD: ACPI: Make ACPI companion lookup work for VMD bus On some systems, in order to get to the deepest low-power state of the platform (which may be necessary to save significant enough amounts of energy while suspended to idle, for example), devices on the PCI bus exposed by the VMD driver need to be power-managed via ACPI. However, the layout of the ACPI namespace below the VMD controller device object does not reflect the layout of the PCI bus under the VMD host bridge, so in order to identify the ACPI companion objects for the devices on that bus, it is necessary to use a special _ADR encoding on the ACPI side. 
In other words, acpi_pci_find_companion() does not work for these devices, so it needs to be amended with a special lookup logic specific to the VMD bus. Address this issue by allowing the VMD driver to temporarily install an ACPI companion lookup hook containing the code matching the devices on the VMD PCI bus with the corresponding objects in the ACPI namespace. Signed-off-by: Rafael J. Wysocki Acked-by: Jon Derrick --- drivers/pci/controller/vmd.c | 55 ++++++++++++++++++++++++++++++++ drivers/pci/host-bridge.c | 1 + drivers/pci/pci-acpi.c | 74 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci-acpi.h | 3 ++ 4 files changed, 133 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index e3fcdfec58b3..a5987e52700e 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -447,6 +448,56 @@ static struct pci_ops vmd_ops = { .write = vmd_pci_write, }; +#ifdef CONFIG_ACPI +static struct acpi_device *vmd_acpi_find_companion(struct pci_dev *pci_dev) +{ + struct pci_host_bridge *bridge; + u32 busnr, addr; + + if (pci_dev->bus->ops != &vmd_ops) + return NULL; + + bridge = pci_find_host_bridge(pci_dev->bus); + busnr = pci_dev->bus->number - bridge->bus->number; + /* + * The address computation below is only applicable to relative bus + * numbers below 32. 
+ */ + if (busnr > 31) + return NULL; + + addr = (busnr << 24) | ((u32)pci_dev->devfn << 16) | 0x8000FFFFU; + + dev_dbg(&pci_dev->dev, "Looking for ACPI companion (address 0x%x)\n", + addr); + + return acpi_find_child_device(ACPI_COMPANION(bridge->dev.parent), addr, + false); +} + +static bool hook_installed; + +static void vmd_acpi_begin(void) +{ + if (pci_acpi_set_companion_lookup_hook(vmd_acpi_find_companion)) + return; + + hook_installed = true; +} + +static void vmd_acpi_end(void) +{ + if (!hook_installed) + return; + + pci_acpi_clear_companion_lookup_hook(); + hook_installed = false; +} +#else +static inline void vmd_acpi_begin(void) { } +static inline void vmd_acpi_end(void) { } +#endif /* CONFIG_ACPI */ + static void vmd_attach_resources(struct vmd_dev *vmd) { vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1]; @@ -747,6 +798,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (vmd->irq_domain) dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + vmd_acpi_begin(); + pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); @@ -760,6 +813,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_bus_add_devices(vmd->bus); + vmd_acpi_end(); + WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj, "domain"), "Can't create symlink to domain\n"); return 0; diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index e01d53f5b32f..afa50b446567 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -23,6 +23,7 @@ struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus) return to_pci_host_bridge(root_bus->bridge); } +EXPORT_SYMBOL_GPL(pci_find_host_bridge); struct device *pci_get_host_bridge_device(struct pci_dev *dev) { diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 36bc23e21759..825988a5c074 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -17,6 +17,7 @@ #include #include #include +#include 
#include "pci.h" /* @@ -1159,6 +1160,69 @@ void acpi_pci_remove_bus(struct pci_bus *bus) } /* ACPI bus type */ + + +static DECLARE_RWSEM(pci_acpi_companion_lookup_sem); +static struct acpi_device *(*pci_acpi_find_companion_hook)(struct pci_dev *); + +/** + * pci_acpi_set_companion_lookup_hook - Set ACPI companion lookup callback. + * @func: ACPI companion lookup callback pointer or NULL. + * + * Set a special ACPI companion lookup callback for PCI devices whose companion + * objects in the ACPI namespace have _ADR with non-standard bus-device-function + * encodings. + * + * Return 0 on success or a negative error code on failure (in which case no + * changes are made). + * + * The caller is responsible for the appropriate ordering of the invocations of + * this function with respect to the enumeration of the PCI devices needing the + * callback installed by it. + */ +int pci_acpi_set_companion_lookup_hook(struct acpi_device *(*func)(struct pci_dev *)) +{ + int ret; + + if (!func) + return -EINVAL; + + down_write(&pci_acpi_companion_lookup_sem); + + if (pci_acpi_find_companion_hook) { + ret = -EBUSY; + } else { + pci_acpi_find_companion_hook = func; + ret = 0; + } + + up_write(&pci_acpi_companion_lookup_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_acpi_set_companion_lookup_hook); + +/** + * pci_acpi_clear_companion_lookup_hook - Clear ACPI companion lookup callback. + * + * Clear the special ACPI companion lookup callback previously set by + * pci_acpi_set_companion_lookup_hook(). Block until the last running instance + * of the callback returns before clearing it. + * + * The caller is responsible for the appropriate ordering of the invocations of + * this function with respect to the enumeration of the PCI devices needing the + * callback cleared by it. 
+ */ +void pci_acpi_clear_companion_lookup_hook(void) +{ + down_write(&pci_acpi_companion_lookup_sem); + + pci_acpi_find_companion_hook = NULL; + + up_write(&pci_acpi_companion_lookup_sem); +} +EXPORT_SYMBOL_GPL(pci_acpi_clear_companion_lookup_hook); + static struct acpi_device *acpi_pci_find_companion(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); @@ -1166,6 +1230,16 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev) bool check_children; u64 addr; + down_read(&pci_acpi_companion_lookup_sem); + + adev = pci_acpi_find_companion_hook ? + pci_acpi_find_companion_hook(pci_dev) : NULL; + + up_read(&pci_acpi_companion_lookup_sem); + + if (adev) + return adev; + check_children = pci_is_bridge(pci_dev); /* Please ref to ACPI spec for the syntax of _ADR */ addr = (PCI_SLOT(pci_dev->devfn) << 16) | PCI_FUNC(pci_dev->devfn); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 5ba475ca9078..f16de399d2de 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -122,6 +122,9 @@ static inline void pci_acpi_add_edr_notifier(struct pci_dev *pdev) { } static inline void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) { } #endif /* CONFIG_PCIE_EDR */ +int pci_acpi_set_companion_lookup_hook(struct acpi_device *(*func)(struct pci_dev *)); +void pci_acpi_clear_companion_lookup_hook(void); + #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } -- cgit v1.2.3-71-gd317 From 4bf8e582119ed9767f907abb6dc62ef9dddf10df Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 1 Sep 2021 14:41:57 +0530 Subject: cpufreq: Remove ready() callback This isn't used anymore, get rid of it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- Documentation/cpu-freq/cpu-drivers.rst | 3 --- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 2 -- drivers/cpufreq/cpufreq.c | 4 ---- include/linux/cpufreq.h | 3 --- 4 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index d84ededb66f9..3b32336a7803 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -75,9 +75,6 @@ And optionally .resume - A pointer to a per-policy resume function which is called with interrupts disabled and _before_ the governor is started again. - .ready - A pointer to a per-policy ready function which is called after - the policy is fully initialized. - .attr - A pointer to a NULL-terminated list of "struct freq_attr" which allow to export values to sysfs. diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 5ae9cfa2ec55..334f30ae198b 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -80,8 +80,6 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次开始前被 调用。 - .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 - .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该函数允许导出值到 sysfs。 diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7d5f170ecad1..5782b15a8caa 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1517,10 +1517,6 @@ static int cpufreq_online(unsigned int cpu) kobject_uevent(&policy->kobj, KOBJ_ADD); - /* Callback for handling stuff after policy is ready */ - if (cpufreq_driver->ready) - cpufreq_driver->ready(policy); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..fe6acc04e5e5 100644 --- 
a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -367,9 +367,6 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); - /* Will be called after the driver is fully initialized */ - void (*ready)(struct cpufreq_policy *policy); - struct freq_attr **attr; /* platform specific boost support code */ -- cgit v1.2.3-71-gd317 From 8083f58d08fd52f547c0a62c0f4e448e15e6726b Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 7 Jul 2021 18:28:35 +0200 Subject: pwm: Make pwmchip_remove() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some time now pwmchip_remove() has always returned 0, so the return value isn't useful. Now that all callers are converted to ignore its value the function can be changed to return void. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 4 +--- include/linux/pwm.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 35e894f4a379..4527f09a5c50 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -304,7 +304,7 @@ EXPORT_SYMBOL_GPL(pwmchip_add); * * Returns: 0 on success or a negative error code on failure. 
*/ -int pwmchip_remove(struct pwm_chip *chip) +void pwmchip_remove(struct pwm_chip *chip) { pwmchip_sysfs_unexport(chip); @@ -318,8 +318,6 @@ int pwmchip_remove(struct pwm_chip *chip) free_pwms(chip); mutex_unlock(&pwm_lock); - - return 0; } EXPORT_SYMBOL_GPL(pwmchip_remove); diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a0b7e43049d5..725c9b784e60 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -404,7 +404,7 @@ int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); int pwmchip_add(struct pwm_chip *chip); -int pwmchip_remove(struct pwm_chip *chip); +void pwmchip_remove(struct pwm_chip *chip); int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip); -- cgit v1.2.3-71-gd317 From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT 730633f0b7f95 became the first direct caller of __init_rwsem() vs the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it. 
[ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- include/linux/rwsem.h | 12 ++---------- kernel/locking/rwsem.c | 10 ++++++---- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 426e98e0b675..352c6127cb90 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -142,22 +142,14 @@ struct rw_semaphore { #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, +extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); -#else -static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -} -#endif #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - init_rwbase_rt(&(sem)->rwbase); \ - __rwsem_init((sem), #sem, &__key); \ + __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9215b4d6a9de..000e8d5a2884 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #include "rwbase_rt.c" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __rwsem_init(struct rw_semaphore *sem, const char *name, +void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); -} -EXPORT_SYMBOL(__rwsem_init); #endif +} +EXPORT_SYMBOL(__init_rwsem); static inline void __down_read(struct 
rw_semaphore *sem) { -- cgit v1.2.3-71-gd317 From d095559ce4100f0c02aea229705230deac329c97 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 5 Jul 2021 09:22:56 +0800 Subject: ceph: flush mdlog before umounting Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 25 +++++++++++++++++++++++++ fs/ceph/mds_client.h | 1 + fs/ceph/strings.c | 1 + include/linux/ceph/ceph_fs.h | 1 + 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 926971822174..d98a3eda0d4c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4693,6 +4693,30 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests done\n"); } +void send_flush_mdlog(struct ceph_mds_session *s) +{ + struct ceph_msg *msg; + + /* + * Pre-luminous MDS crashes when it sees an unknown session request + */ + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) + return; + + mutex_lock(&s->s_mutex); + dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, + ceph_session_state_name(s->s_state), s->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, + s->s_seq); + if (!msg) { + pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + } else { + ceph_con_send(&s->s_con, msg); + } + mutex_unlock(&s->s_mutex); +} + /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) 
@@ -4702,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) dout("pre_umount\n"); mdsc->stopping = 1; + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4a75a14c2a88..97c7f7bfa55f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -522,6 +522,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } +extern void send_flush_mdlog(struct ceph_mds_session *s); extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, void (*cb)(struct ceph_mds_session *), bool check_state); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4a79f3632260..573bb9556fb5 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; case CEPH_SESSION_FORCE_RO: return "force_ro"; case CEPH_SESSION_REJECT: return "reject"; + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog"; } return "???"; } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index e41a811026f6..bc2699feddbe 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_SESSION_FLUSHMSG_ACK, CEPH_SESSION_FORCE_RO, CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3-71-gd317 From 0b303fb402862dcb7948eeeed2439bd8c99948b5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 8 May 2021 02:28:02 +0200 Subject: mm, slub: do initial checks in ___slab_alloc() with irqs enabled As another step of shortening irq disabled sections in ___slab_alloc(), delay disabling irqs until we pass the initial checks if there is a cached percpu slab and it's suitable for our allocation. 
Now we have to recheck c->page after actually disabling irqs as an allocation in irq handler might have replaced it. Because we call pfmemalloc_match() as one of the checks, we might hit VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() variant that lacks the PageSlab check. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman --- include/linux/page-flags.h | 9 ++++++++ mm/slub.c | 54 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..7fda4fb85bdc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(struct page *page) return PageActive(page); } +/* + * A version of PageSlabPfmemalloc() for opportunistic checks where the page + * might have been freed under us and not be a PageSlab anymore. + */ +static inline int __PageSlabPfmemalloc(struct page *page) +{ + return PageActive(page); +} + static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); diff --git a/mm/slub.c b/mm/slub.c index dda05cc83eef..6295695d8515 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2620,6 +2620,19 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) return true; } +/* + * A variant of pfmemalloc_match() that tests page flags without asserting + * PageSlab. Intended for opportunistic checks before taking a lock and + * rechecking that nobody else freed the page under us. + */ +static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) +{ + if (unlikely(__PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + /* * Check the page->freelist of a page and either transfer the freelist to the * per cpu freelist or deactivate the page. 
@@ -2682,8 +2695,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); - local_irq_save(flags); - page = c->page; +reread_page: + + page = READ_ONCE(c->page); if (!page) { /* * if the node is not online or has no normal memory, just @@ -2692,6 +2706,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; + local_irq_save(flags); + if (unlikely(c->page)) { + local_irq_restore(flags); + goto reread_page; + } goto new_slab; } redo: @@ -2706,8 +2725,7 @@ redo: goto redo; } else { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist, c); - goto new_slab; + goto deactivate_slab; } } @@ -2716,12 +2734,15 @@ redo: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist, c); - goto new_slab; - } + if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) + goto deactivate_slab; - /* must check again c->freelist in case of cpu migration or IRQ */ + /* must check again c->page in case IRQ handler changed it */ + local_irq_save(flags); + if (unlikely(page != c->page)) { + local_irq_restore(flags); + goto reread_page; + } freelist = c->freelist; if (freelist) goto load_freelist; @@ -2737,6 +2758,9 @@ redo: stat(s, ALLOC_REFILL); load_freelist: + + lockdep_assert_irqs_disabled(); + /* * freelist is pointing to the list of objects to be used. * page is pointing to the page from which the objects are obtained. 
@@ -2748,11 +2772,23 @@ load_freelist: local_irq_restore(flags); return freelist; +deactivate_slab: + + local_irq_save(flags); + if (page != c->page) { + local_irq_restore(flags); + goto reread_page; + } + deactivate_slab(s, page, c->freelist, c); + new_slab: + lockdep_assert_irqs_disabled(); + if (slub_percpu_partial(c)) { page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); + local_irq_restore(flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } -- cgit v1.2.3-71-gd317 From 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 11:18:44 +0100 Subject: iov_iter: track truncated size Remember how many bytes were truncated and reverted back. Because not reexpanded iterators don't always work well with reverting, we may need to know that to reexpand ourselves when needed. Signed-off-by: Pavel Begunkov Signed-off-by: Al Viro --- include/linux/uio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 82c3c3e819e0..5265024e8b90 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -47,6 +47,7 @@ struct iov_iter { }; loff_t xarray_start; }; + size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -254,8 +255,10 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of size_t, including old i->count. 
*/ - if (i->count > count) + if (i->count > count) { + i->truncated += i->count - count; i->count = count; + } } /* @@ -264,6 +267,7 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { + i->truncated -= count - i->count; i->count = count; } -- cgit v1.2.3-71-gd317 From bd0e7491a931f5a2960555b10b9551464ff8cc8e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 22 May 2021 01:59:38 +0200 Subject: mm, slub: convert kmem_cpu_slab protection to local_lock Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's equivalent, with better lockdep visibility. On PREEMPT_RT that means better preemption. However, the cost on PREEMPT_RT is the loss of lockless fast paths which only work with cpu freelist. Those are designed to detect and recover from being preempted by other conflicting operations (both fast or slow path), but the slow path operations assume they cannot be preempted by a fast path operation, which is guaranteed naturally with disabled irqs. With local locks on PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. In the allocation fastpath slab_alloc_node() we can just defer to the slowpath __slab_alloc() which also works with cpu freelist, but under the local lock. In the free fastpath do_slab_free() we have to add a new local lock protected version of freeing to the cpu freelist, as the existing slowpath only works with the page freelist. Also update the comment about locking scheme in SLUB to reflect changes done by this series. 
[ Mike Galbraith : use local_lock() without irq in PREEMPT_RT scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ] Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 6 ++ mm/slub.c | 146 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 117 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dcde82a4434c..85499f0586b0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -40,6 +41,10 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ @@ -47,6 +52,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ #endif + local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif diff --git a/mm/slub.c b/mm/slub.c index 38d4cc51e880..3d2025f7163b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -46,13 +46,21 @@ /* * Lock order: * 1. slab_mutex (Global Mutex) - * 2. node->list_lock - * 3. slab_lock(page) (Only on some arches and for debugging) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(page) (Only on some arches or for debugging) + * 5. object_map_lock (Only for debugging) * * slab_mutex * * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. 
+ * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. * * The slab_lock is only used for debugging and on arches that do not * have the ability to do a cmpxchg_double. It only protects: @@ -61,6 +69,8 @@ * C. page->objects -> Number of objects in page * D. page->frozen -> frozen state * + * Frozen slabs + * * If a slab is frozen then it is exempt from list management. It is not * on any list except per cpu partial list. The processor that froze the * slab is the one who can perform list operations on the page. Other @@ -68,6 +78,8 @@ * froze the slab is the only one that can retrieve the objects from the * page's freelist. * + * list_lock + * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or * removed from the lists nor make the number of partial slabs be modified. @@ -79,10 +91,36 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * Interrupts are disabled during allocation and deallocation in order to - * make the slab allocator safe to use in the context of an irq. In addition - * interrupts are disabled to ensure that the processor does not change - * while handling per_cpu slabs, due to kernel preemption. + * + * cpu_slab->lock local lock + * + * This locks protect slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * On PREEMPT_RT, the local lock does not actually disable irqs (and thus + * prevent the lockless operations), so fastpath operations also need to take + * the lock and are no longer lockless. 
+ * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. + * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. * * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. 
@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; + struct kmem_cache_cpu *c; - for_each_possible_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_lock_init(&c->lock); + c->tid = init_tid(cpu); + } } /* @@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kmem_cache *s) struct page *partial_page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); partial_page = this_cpu_read(s->cpu_slab->partial); this_cpu_write(s->cpu_slab->partial, NULL); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_page) __unfreeze_partials(s, partial_page); @@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages = 0; int pobjects = 0; - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); oldpage = this_cpu_read(s->cpu_slab->partial); @@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) this_cpu_write(s->cpu_slab->partial, page); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (page_to_unfreeze) { __unfreeze_partials(s, page_to_unfreeze); @@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) struct page *page; void *freelist; - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); page = c->page; freelist = c->freelist; @@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) c->freelist = NULL; c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (page) { deactivate_slab(s, page, freelist); @@ -2780,8 +2822,6 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) * 
The page is still frozen if the return value is not NULL. * * If this function returns NULL then the page has been unfrozen. - * - * This function must be called with interrupt disabled. */ static inline void *get_freelist(struct kmem_cache *s, struct page *page) { @@ -2789,6 +2829,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) unsigned long counters; void *freelist; + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + do { freelist = page->freelist; counters = page->counters; @@ -2873,9 +2915,9 @@ redo: goto deactivate_slab; /* must check again c->page in case we got preempted and it changed */ - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(page != c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } freelist = c->freelist; @@ -2886,7 +2928,7 @@ redo: if (!freelist) { c->page = NULL; - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2895,7 +2937,7 @@ redo: load_freelist: - lockdep_assert_irqs_disabled(); + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); /* * freelist is pointing to the list of objects to be used. 
@@ -2905,39 +2947,39 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist; deactivate_slab: - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (page != c->page) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } freelist = c->freelist; c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, page, freelist); new_slab: if (slub_percpu_partial(c)) { - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } if (unlikely(!slub_percpu_partial(c))) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); /* we were preempted and partial list got empty */ goto new_objects; } page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } @@ -2990,7 +3032,7 @@ check_new_page: retry_load_page: - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { void *flush_freelist = c->freelist; struct page *flush_page = c->page; @@ -2999,7 +3041,7 @@ retry_load_page: c->freelist = NULL; c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, flush_page, flush_freelist); @@ -3118,7 +3160,15 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !page || !node_match(page, node))) { + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if a + * slowpath has taken the local_lock_irqsave(), it is not protected + * against a fast 
path operation in an irq handler. So we need to take + * the slow path which uses local_lock. It is still relatively fast if + * there is a suitable cpu freelist. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || + unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); @@ -3378,6 +3428,7 @@ redo: barrier(); if (likely(page == c->page)) { +#ifndef CONFIG_PREEMPT_RT void **freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3390,6 +3441,31 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } +#else /* CONFIG_PREEMPT_RT */ + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if + * a slowpath has taken the local_lock_irqsave(), it is not + * protected against a fast path operation in an irq handler. So + * we need to take the local_lock. We shouldn't simply defer to + * __slab_free() as that wouldn't use the cpu freelist at all. + */ + void **freelist; + + local_lock(&s->cpu_slab->lock); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(page != c->page)) { + local_unlock(&s->cpu_slab->lock); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail_obj, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock(&s->cpu_slab->lock); +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, head, tail_obj, cnt, addr); @@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * handlers invoking normal fastpath. 
*/ c = slub_get_cpu_ptr(s->cpu_slab); - local_irq_disable(); + local_lock_irq(&s->cpu_slab->lock); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); /* * Invoking slow path likely have side-effect @@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - local_irq_disable(); + local_lock_irq(&s->cpu_slab->lock); continue; /* goto for-loop */ } @@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab); /* -- cgit v1.2.3-71-gd317 From 8486a32dd484a7d7ec25295c7439094608f54915 Mon Sep 17 00:00:00 2001 From: "Hector.Yuan" Date: Fri, 3 Sep 2021 16:39:23 +0800 Subject: cpufreq: Add of_perf_domain_get_sharing_cpumask Add of_perf_domain_get_sharing_cpumask function to group cpu to specific performance domain. Signed-off-by: Hector.Yuan [ Viresh: create separate routine parse_perf_domain() and always set the cpumask. 
] Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..acd3ee5b8b0a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -1003,6 +1005,55 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy return count; } + +static inline int parse_perf_domain(int cpu, const char *list_name, + const char *cell_name) +{ + struct device_node *cpu_np; + struct of_phandle_args args; + int ret; + + cpu_np = of_cpu_device_node_get(cpu); + if (!cpu_np) + return -ENODEV; + + ret = of_parse_phandle_with_args(cpu_np, list_name, cell_name, 0, + &args); + if (ret < 0) + return ret; + + of_node_put(cpu_np); + + return args.args[0]; +} + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + int target_idx; + int cpu, ret; + + ret = parse_perf_domain(pcpu, list_name, cell_name); + if (ret < 0) + return ret; + + target_idx = ret; + cpumask_set_cpu(pcpu, cpumask); + + for_each_possible_cpu(cpu) { + if (cpu == pcpu) + continue; + + ret = parse_perf_domain(pcpu, list_name, cell_name); + if (ret < 0) + continue; + + if (target_idx == ret) + cpumask_set_cpu(cpu, cpumask); + } + + return target_idx; +} #else static inline int cpufreq_boost_trigger_state(int state) { @@ -1022,6 +1073,12 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) { return false; } + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + return -EOPNOTSUPP; +} #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -1043,7 +1100,6 @@ void 
arch_set_freq_scale(const struct cpumask *cpus, { } #endif - /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -- cgit v1.2.3-71-gd317 From 9c930054f2f5326d59ee4bf8d7d1cf6c82f5643b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:24 +0800 Subject: file: Export receive_fd() to modules Export receive_fd() so that some modules can use it to pass a file descriptor between processes without missing any security stuff. Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-4-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- fs/file.c | 6 ++++++ include/linux/file.h | 7 +++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 86dc9956af32..210e540672aa 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1134,6 +1134,12 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } +int receive_fd(struct file *file, unsigned int o_flags) +{ + return __receive_fd(file, NULL, o_flags); +} +EXPORT_SYMBOL_GPL(receive_fd); + static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/include/linux/file.h b/include/linux/file.h index 2de2e4613d7b..51e830b4fe3a 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -94,6 +94,9 @@ extern void fd_install(unsigned int fd, struct file *file); extern int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); + +extern int receive_fd(struct file *file, unsigned int o_flags); + static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { @@ -101,10 +104,6 @@ static inline int receive_fd_user(struct file *file, int __user *ufd, return -EFAULT; return __receive_fd(file, ufd, o_flags); } -static inline int receive_fd(struct file *file, unsigned 
int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); -- cgit v1.2.3-71-gd317 From 86e17a51c1a5a299009f8b1645e3e9da0d59faae Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:25 +0800 Subject: vdpa: Fix some coding style issues Fix some code indent issues and following checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' 371: FILE: include/linux/vdpa.h:371: +static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, Signed-off-by: Xie Yongji Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210831103634.33-5-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8cfe49d201dd..8ae1134070eb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -43,17 +43,17 @@ struct vdpa_vq_state_split { * @last_used_idx: used index */ struct vdpa_vq_state_packed { - u16 last_avail_counter:1; - u16 last_avail_idx:15; - u16 last_used_counter:1; - u16 last_used_idx:15; + u16 last_avail_counter:1; + u16 last_avail_idx:15; + u16 last_used_counter:1; + u16 last_used_idx:15; }; struct vdpa_vq_state { - union { - struct vdpa_vq_state_split split; - struct vdpa_vq_state_packed packed; - }; + union { + struct vdpa_vq_state_split split; + struct vdpa_vq_state_packed packed; + }; }; struct vdpa_mgmt_dev; @@ -131,7 +131,7 @@ struct vdpa_iova_range { * @vdev: vdpa device * @idx: virtqueue index * @state: pointer to returned state (last_avail_idx) - * @get_vq_notification: Get the notification area for a virtqueue + * @get_vq_notification: Get the notification area for a virtqueue * @vdev: vdpa device * @idx: virtqueue index * Returns the notifcation area @@ 
-350,25 +350,25 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) static inline void vdpa_reset(struct vdpa_device *vdev) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + ops->set_status(vdev, 0); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = true; - return ops->set_features(vdev, features); + return ops->set_features(vdev, features); } - -static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, - void *buf, unsigned int len) +static inline void vdpa_get_config(struct vdpa_device *vdev, + unsigned int offset, void *buf, + unsigned int len) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. -- cgit v1.2.3-71-gd317 From 0686082dbf7a204ca0fab326a820779e31666639 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:26 +0800 Subject: vdpa: Add reset callback in vdpa_config_ops This adds a new callback to support device specific reset behavior. The vdpa bus driver will call the reset function instead of setting status to zero during resetting. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210831103634.33-6-xieyongji@bytedance.com Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 35 +++++++++++++++++++++++----------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 40 +++++++++++++++++++++++---------------- drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +++++++++++++++--- drivers/vdpa/virtio_pci/vp_vdpa.c | 15 +++++++++++++-- drivers/vhost/vdpa.c | 9 +++++++-- include/linux/vdpa.h | 8 ++++++-- 6 files changed, 89 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index bfc3d7d40c09..4293481ce910 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -222,17 +222,6 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) if (status_old == status) return; - if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && - !(status & VIRTIO_CONFIG_S_DRIVER_OK)) { - ifcvf_stop_datapath(adapter); - ifcvf_free_irq(adapter, vf->nr_vring); - } - - if (status == 0) { - ifcvf_reset_vring(adapter); - return; - } - if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) { ret = ifcvf_request_irq(adapter); @@ -252,6 +241,29 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) ifcvf_set_status(vf, status); } +static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_adapter *adapter; + struct ifcvf_hw *vf; + u8 status_old; + + vf = vdpa_to_vf(vdpa_dev); + adapter = vdpa_to_adapter(vdpa_dev); + status_old = ifcvf_get_status(vf); + + if (status_old == 0) + return 0; + + if (status_old & VIRTIO_CONFIG_S_DRIVER_OK) { + ifcvf_stop_datapath(adapter); + ifcvf_free_irq(adapter, vf->nr_vring); + } + + ifcvf_reset_vring(adapter); + + return 0; +} + static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) { return IFCVF_QUEUE_MAX; @@ -435,6 +447,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .set_features = ifcvf_vdpa_set_features, .get_status = ifcvf_vdpa_get_status, .set_status = ifcvf_vdpa_set_status, + .reset = 
ifcvf_vdpa_reset, .get_vq_num_max = ifcvf_vdpa_get_vq_num_max, .get_vq_state = ifcvf_vdpa_get_vq_state, .set_vq_state = ifcvf_vdpa_set_vq_state, diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 4ba3ac48ee83..608f6b900cd9 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2154,22 +2154,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) int err; print_status(mvdev, status, true); - if (!status) { - mlx5_vdpa_info(mvdev, "performing device reset\n"); - teardown_driver(ndev); - clear_vqs_ready(ndev); - mlx5_vdpa_destroy_mr(&ndev->mvdev); - ndev->mvdev.status = 0; - ndev->mvdev.mlx_features = 0; - memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs)); - ndev->mvdev.actual_features = 0; - ++mvdev->generation; - if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - if (mlx5_vdpa_create_mr(mvdev, NULL)) - mlx5_vdpa_warn(mvdev, "create MR failed\n"); - } - return; - } if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { if (status & VIRTIO_CONFIG_S_DRIVER_OK) { @@ -2192,6 +2176,29 @@ err_setup: ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; } +static int mlx5_vdpa_reset(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + print_status(mvdev, 0, true); + mlx5_vdpa_info(mvdev, "performing device reset\n"); + teardown_driver(ndev); + clear_vqs_ready(ndev); + mlx5_vdpa_destroy_mr(&ndev->mvdev); + ndev->mvdev.status = 0; + ndev->mvdev.mlx_features = 0; + memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs)); + ndev->mvdev.actual_features = 0; + ++mvdev->generation; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_mr(mvdev, NULL)) + mlx5_vdpa_warn(mvdev, "create MR failed\n"); + } + + return 0; +} + static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) { return sizeof(struct virtio_net_config); @@ -2305,6 +2312,7 @@ static const struct vdpa_config_ops 
mlx5_vdpa_ops = { .get_vendor_id = mlx5_vdpa_get_vendor_id, .get_status = mlx5_vdpa_get_status, .set_status = mlx5_vdpa_set_status, + .reset = mlx5_vdpa_reset, .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, .set_config = mlx5_vdpa_set_config, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 5b51d0ac8bae..f292bb05d6c9 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -92,7 +92,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->vring.notify = NULL; } -static void vdpasim_reset(struct vdpasim *vdpasim) +static void vdpasim_do_reset(struct vdpasim *vdpasim) { int i; @@ -460,11 +460,21 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) spin_lock(&vdpasim->lock); vdpasim->status = status; - if (status == 0) - vdpasim_reset(vdpasim); spin_unlock(&vdpasim->lock); } +static int vdpasim_reset(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + spin_lock(&vdpasim->lock); + vdpasim->status = 0; + vdpasim_do_reset(vdpasim); + spin_unlock(&vdpasim->lock); + + return 0; +} + static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -608,6 +618,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .reset = vdpasim_reset, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -636,6 +647,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .reset = vdpasim_reset, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c 
index fe0527329857..cd7718b43a6e 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -189,10 +189,20 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status) } vp_modern_set_status(mdev, status); +} - if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) && - (s & VIRTIO_CONFIG_S_DRIVER_OK)) +static int vp_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + u8 s = vp_vdpa_get_status(vdpa); + + vp_modern_set_status(mdev, 0); + + if (s & VIRTIO_CONFIG_S_DRIVER_OK) vp_vdpa_free_irq(vp_vdpa); + + return 0; } static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa) @@ -398,6 +408,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .set_features = vp_vdpa_set_features, .get_status = vp_vdpa_get_status, .set_status = vp_vdpa_set_status, + .reset = vp_vdpa_reset, .get_vq_num_max = vp_vdpa_get_vq_num_max, .get_vq_state = vp_vdpa_get_vq_state, .get_vq_notification = vp_vdpa_get_vq_notification, diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 42c998c82102..5562258f6238 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -157,7 +157,7 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u8 status, status_old; - int nvqs = v->nvqs; + int ret, nvqs = v->nvqs; u16 i; if (copy_from_user(&status, statusp, sizeof(status))) @@ -172,7 +172,12 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) return -EINVAL; - ops->set_status(vdpa, status); + if (status == 0) { + ret = ops->reset(vdpa); + if (ret) + return ret; + } else + ops->set_status(vdpa, status); if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) for (i = 0; i < nvqs; i++) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h 
index 8ae1134070eb..e1eae8c7483d 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -171,6 +171,9 @@ struct vdpa_iova_range { * @set_status: Set the device status * @vdev: vdpa device * @status: virtio device status + * @reset: Reset device + * @vdev: vdpa device + * Returns integer: success (0) or error (< 0) * @get_config_size: Get the size of the configuration space * @vdev: vdpa device * Returns size_t: configuration size @@ -255,6 +258,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + int (*reset)(struct vdpa_device *vdev); size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); @@ -348,12 +352,12 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) return vdev->dma_dev; } -static inline void vdpa_reset(struct vdpa_device *vdev) +static inline int vdpa_reset(struct vdpa_device *vdev) { const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + return ops->reset(vdev); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) -- cgit v1.2.3-71-gd317 From 59dfe4f1e810b5820443c84f9863b04b033143e8 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:28 +0800 Subject: vhost-iotlb: Add an opaque pointer for vhost IOTLB Add an opaque pointer for vhost IOTLB. And introduce vhost_iotlb_add_range_ctx() to accept it. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-8-xieyongji@bytedance.com Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/iotlb.c | 20 ++++++++++++++++---- include/linux/vhost_iotlb.h | 3 +++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 0582079e4bcc..670d56c879e5 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -36,19 +36,21 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, EXPORT_SYMBOL_GPL(vhost_iotlb_map_free); /** - * vhost_iotlb_add_range - add a new range to vhost IOTLB + * vhost_iotlb_add_range_ctx - add a new range to vhost IOTLB * @iotlb: the IOTLB * @start: start of the IOVA range * @last: last of IOVA range * @addr: the address that is mapped to @start * @perm: access permission of this range + * @opaque: the opaque pointer for the new mapping * * Returns an error last is smaller than start or memory allocation * fails */ -int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, - u64 start, u64 last, - u64 addr, unsigned int perm) +int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, + u64 start, u64 last, + u64 addr, unsigned int perm, + void *opaque) { struct vhost_iotlb_map *map; @@ -71,6 +73,7 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, map->last = last; map->addr = addr; map->perm = perm; + map->opaque = opaque; iotlb->nmaps++; vhost_iotlb_itree_insert(map, &iotlb->root); @@ -80,6 +83,15 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, return 0; } +EXPORT_SYMBOL_GPL(vhost_iotlb_add_range_ctx); + +int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, + u64 start, u64 last, + u64 addr, unsigned int perm) +{ + return vhost_iotlb_add_range_ctx(iotlb, start, last, + addr, perm, NULL); +} EXPORT_SYMBOL_GPL(vhost_iotlb_add_range); /** diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h index 6b09b786a762..2d0e2f52f938 100644 --- a/include/linux/vhost_iotlb.h +++ b/include/linux/vhost_iotlb.h @@ -17,6 +17,7 @@ struct vhost_iotlb_map { u32 perm; u32 flags_padding; u64 __subtree_last; + void 
*opaque; }; #define VHOST_IOTLB_FLAG_RETIRE 0x1 @@ -29,6 +30,8 @@ struct vhost_iotlb { unsigned int flags; }; +int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, u64 start, u64 last, + u64 addr, unsigned int perm, void *opaque); int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, u64 addr, unsigned int perm); void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); -- cgit v1.2.3-71-gd317 From c10fb9454adc80c062151c6a436047e1fa59e99f Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:29 +0800 Subject: vdpa: Add an opaque pointer for vdpa_config_ops.dma_map() Add an opaque pointer for DMA mapping. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-9-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 6 +++--- drivers/vhost/vdpa.c | 2 +- include/linux/vdpa.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index f292bb05d6c9..a70fd2a08ff1 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -555,14 +555,14 @@ err: } static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size, - u64 pa, u32 perm) + u64 pa, u32 perm, void *opaque) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int ret; spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range(vdpasim->iommu, iova, iova + size - 1, pa, - perm); + ret = vhost_iotlb_add_range_ctx(vdpasim->iommu, iova, iova + size - 1, + pa, perm, opaque); spin_unlock(&vdpasim->iommu_lock); return ret; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 86fb8d9ce409..5eb2fcc59532 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -571,7 +571,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, return r; if (ops->dma_map) { - r = ops->dma_map(vdpa, iova, size, pa, perm); 
+ r = ops->dma_map(vdpa, iova, size, pa, perm, NULL); } else if (ops->set_map) { if (!v->in_batch) r = ops->set_map(vdpa, dev->iotlb); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index e1eae8c7483d..f3014aaca47e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -270,7 +270,7 @@ struct vdpa_config_ops { /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, - u64 pa, u32 perm); + u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); /* Free device resources */ -- cgit v1.2.3-71-gd317 From d8945ec411209272bcd4ae9e75ea1b078257e492 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:31 +0800 Subject: vdpa: Support transferring virtual addressing during DMA mapping This patch introduces an attribute for vDPA device to indicate whether virtual address can be used. If vDPA device driver set it, vhost-vdpa bus driver will not pin user page and transfer userspace virtual address instead of physical address during DMA mapping. And corresponding vma->vm_file and offset will be also passed as an opaque pointer. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-11-xieyongji@bytedance.com Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/vdpa.c | 9 +++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- drivers/vhost/vdpa.c | 99 ++++++++++++++++++++++++++++++++++----- include/linux/vdpa.h | 20 ++++++-- 7 files changed, 117 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 4293481ce910..dcd648e1f7e7 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -515,7 +515,7 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) pdev = ifcvf_mgmt_dev->pdev; dev = &pdev->dev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, name); + dev, &ifc_vdpa_ops, name, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 608f6b900cd9..08f39952fa6a 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2425,7 +2425,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - name); + name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index bb3f1d1f0422..8f01d6a7ecc5 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -71,6 +71,7 @@ static void vdpa_release_dev(struct device *d) * @config: the bus operations that is supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. 
+ * @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. @@ -80,7 +81,8 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - size_t size, const char *name) + size_t size, const char *name, + bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; @@ -91,6 +93,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, if (!!config->dma_map != !!config->dma_unmap) goto err; + /* It should only work for the device that use on-chip IOMMU */ + if (use_va && !(config->dma_map || config->set_map)) + goto err; + err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) @@ -106,6 +112,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->index = err; vdev->config = config; vdev->features_valid = false; + vdev->use_va = use_va; if (name) err = dev_set_name(&vdev->dev, "%s", name); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index a70fd2a08ff1..5f484fff8dbe 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -251,7 +251,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - dev_attr->name); + dev_attr->name, false); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); goto err_alloc; diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index cd7718b43a6e..5bcd00246d2e 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -446,7 +446,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, NULL); + dev, &vp_vdpa_ops, NULL, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to 
allocate vDPA structure\n"); return PTR_ERR(vp_vdpa); diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ad50d52d6655..f41d081777f5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -526,8 +526,28 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last) } } +static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + + while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) { + map_file = (struct vdpa_map_file *)map->opaque; + fput(map_file->file); + kfree(map_file); + vhost_iotlb_map_free(iotlb, map); + } +} + static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) { + struct vdpa_device *vdpa = v->vdpa; + + if (vdpa->use_va) + return vhost_vdpa_va_unmap(v, start, last); + return vhost_vdpa_pa_unmap(v, start, last); } @@ -562,21 +582,21 @@ static int perm_to_iommu_flags(u32 perm) return flags | IOMMU_CACHE; } -static int vhost_vdpa_map(struct vhost_vdpa *v, - u64 iova, u64 size, u64 pa, u32 perm) +static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova, + u64 size, u64 pa, u32 perm, void *opaque) { struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; int r = 0; - r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, - pa, perm); + r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1, + pa, perm, opaque); if (r) return r; if (ops->dma_map) { - r = ops->dma_map(vdpa, iova, size, pa, perm, NULL); + r = ops->dma_map(vdpa, iova, size, pa, perm, opaque); } else if (ops->set_map) { if (!v->in_batch) r = ops->set_map(vdpa, dev->iotlb); @@ -584,13 +604,15 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); } - - if (r) + if (r) { vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 
1); - else + return r; + } + + if (!vdpa->use_va) atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm); - return r; + return 0; } static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) @@ -611,6 +633,56 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) } } +static int vhost_vdpa_va_map(struct vhost_vdpa *v, + u64 iova, u64 size, u64 uaddr, u32 perm) +{ + struct vhost_dev *dev = &v->vdev; + u64 offset, map_size, map_iova = iova; + struct vdpa_map_file *map_file; + struct vm_area_struct *vma; + int ret; + + mmap_read_lock(dev->mm); + + while (size) { + vma = find_vma(dev->mm, uaddr); + if (!vma) { + ret = -EINVAL; + break; + } + map_size = min(size, vma->vm_end - uaddr); + if (!(vma->vm_file && (vma->vm_flags & VM_SHARED) && + !(vma->vm_flags & (VM_IO | VM_PFNMAP)))) + goto next; + + map_file = kzalloc(sizeof(*map_file), GFP_KERNEL); + if (!map_file) { + ret = -ENOMEM; + break; + } + offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start; + map_file->offset = offset; + map_file->file = get_file(vma->vm_file); + ret = vhost_vdpa_map(v, map_iova, map_size, uaddr, + perm, map_file); + if (ret) { + fput(map_file->file); + kfree(map_file); + break; + } +next: + size -= map_size; + uaddr += map_size; + map_iova += map_size; + } + if (ret) + vhost_vdpa_unmap(v, iova, map_iova - iova); + + mmap_read_unlock(dev->mm); + + return ret; +} + static int vhost_vdpa_pa_map(struct vhost_vdpa *v, u64 iova, u64 size, u64 uaddr, u32 perm) { @@ -677,7 +749,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, csize = PFN_PHYS(last_pfn - map_pfn + 1); ret = vhost_vdpa_map(v, iova, csize, PFN_PHYS(map_pfn), - perm); + perm, NULL); if (ret) { /* * Unpin the pages that are left unmapped @@ -706,7 +778,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, /* Pin the rest chunk */ ret = vhost_vdpa_map(v, iova, PFN_PHYS(last_pfn - map_pfn + 1), - PFN_PHYS(map_pfn), perm); + PFN_PHYS(map_pfn), perm, NULL); out: if (ret) { if (nchunks) { @@ -739,6 
+811,7 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_iotlb_msg *msg) { struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; struct vhost_iotlb *iotlb = dev->iotlb; if (msg->iova < v->range.first || !msg->size || @@ -750,6 +823,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, msg->iova + msg->size - 1)) return -EEXIST; + if (vdpa->use_va) + return vhost_vdpa_va_map(v, msg->iova, msg->size, + msg->uaddr, msg->perm); + return vhost_vdpa_pa_map(v, msg->iova, msg->size, msg->uaddr, msg->perm); } diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index f3014aaca47e..3972ab765de1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -65,6 +65,7 @@ struct vdpa_mgmt_dev; * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @use_va: indicate whether virtual address must be used by this device * @nvqs: maximum number of supported virtqueues * @mdev: management device pointer; caller must setup when registering device as part * of dev_add() mgmtdev ops callback before invoking _vdpa_register_device(). @@ -75,6 +76,7 @@ struct vdpa_device { const struct vdpa_config_ops *config; unsigned int index; bool features_valid; + bool use_va; int nvqs; struct vdpa_mgmt_dev *mdev; }; @@ -89,6 +91,16 @@ struct vdpa_iova_range { u64 last; }; +/** + * Corresponding file area for device memory mapping + * @file: vma->vm_file for the mapping + * @offset: mapping offset in the vm_file + */ +struct vdpa_map_file { + struct file *file; + u64 offset; +}; + /** * struct vdpa_config_ops - operations for configuring a vDPA device. 
* Note: vDPA device drivers are required to implement all of the @@ -279,7 +291,8 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - size_t size, const char *name); + size_t size, const char *name, + bool use_va); /** * vdpa_alloc_device - allocate and initilaize a vDPA device @@ -289,15 +302,16 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @parent: the parent device * @config: the bus operations that is supported by this device * @name: name of the vdpa device + * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, name) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va) \ container_of(__vdpa_alloc_device( \ parent, config, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name), \ + dev_struct, member)), name, use_va), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, int nvqs); -- cgit v1.2.3-71-gd317 From ca67408ad57a5a67ad6801d792c40c010451bdef Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 6 Sep 2021 09:44:52 +0100 Subject: PM: EM: fix kernel-doc comments Fix the kernel-doc comments for the improved Energy Model documentation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1834752c5617..39dcadd492b5 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -11,7 +11,7 @@ #include /** - * em_perf_state - Performance state of a performance domain + * struct em_perf_state - Performance state of a performance domain * @frequency: The frequency in KHz, for consistency with CPUFreq * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic. @@ -25,7 +25,7 @@ struct em_perf_state { }; /** - * em_perf_domain - Performance domain + * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states * @milliwatts: Flag indicating the power values are in milli-Watts @@ -103,12 +103,12 @@ void em_dev_unregister_perf_domain(struct device *dev); /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a - performance domain + * performance domain * @pd : performance domain for which energy has to be estimated * @max_util : highest utilization among CPUs of the domain * @sum_util : sum of the utilization of all CPUs in the domain * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which - might reflect reduced frequency (due to thermal) + * might reflect reduced frequency (due to thermal) * * This function must be used only for CPU devices. There is no validation, * i.e. if the EM is a CPU type and has cpumask allocated. 
It is called from -- cgit v1.2.3-71-gd317 From 39ff83f2f6cc5cc1458dfcea9697f96338210beb Mon Sep 17 00:00:00 2001 From: Lukas Hannen Date: Wed, 25 Aug 2021 10:12:43 +0000 Subject: time: Handle negative seconds correctly in timespec64_to_ns() timespec64_ns() prevents multiplication overflows by comparing the seconds value of the timespec to KTIME_SEC_MAX. If the value is greater or equal it returns KTIME_MAX. But that check casts the signed seconds value to unsigned which makes the comparision true for all negative values and therefore return wrongly KTIME_MAX. Negative second values are perfectly valid and required in some places, e.g. ptp_clock_adjtime(). Remove the cast and add a check for the negative boundary which is required to prevent undefined behaviour due to multiplication underflow. Fixes: cb47755725da ("time: Prevent undefined behaviour in timespec64_to_ns()")' Signed-off-by: Lukas Hannen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/AM6PR01MB541637BD6F336B8FFB72AF80EEC69@AM6PR01MB5416.eurprd01.prod.exchangelabs.com --- include/linux/time64.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 5117cb5b5656..81b9686a2079 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -25,7 +25,9 @@ struct itimerspec64 { #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): @@ -124,10 +126,13 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { - /* Prevent multiplication overflow */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + /* Prevent multiplication overflow / underflow */ + if (ts->tv_sec >= KTIME_SEC_MAX) 
return KTIME_MAX; + if (ts->tv_sec <= KTIME_SEC_MIN) + return KTIME_MIN; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } -- cgit v1.2.3-71-gd317 From 859a85ddf90e714092dea71a0e54c7b9896621be Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 7 Sep 2021 19:54:52 -0700 Subject: mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock. This makes pfn_valid_within() check and CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone() that had pfn_valid_within() surrounded by more logic than simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock and so nothing can enable CONFIG_HOLES_IN_ZONE which guards non trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 -- include/linux/mmzone.h | 12 ------------ mm/Kconfig | 3 --- mm/compaction.c | 20 +++++++------------- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 24 ++---------------------- mm/page_isolation.c | 7 +------ mm/page_owner.c | 14 +------------- 8 files changed, 11 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 4a4ae868ad9f..8ec6b7dfbb0f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -768,8 +768,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __ref get_nid_for_pfn(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..ee3a86830519 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. 
- */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 40a9bfcd5062..14d5d2837737 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,9 +96,6 @@ config HAVE_FAST_GUP depends on MMU bool -config HOLES_IN_ZONE - bool - # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..ed37e1cb4369 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, * is necessary for the block to be a migration source/target. */ do { - if (pfn_valid_within(pfn)) { - if (check_source && PageLRU(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } - if (check_target && PageBuddy(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; } page += (1 << PAGE_ALLOC_COSTLY_ORDER); @@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; nr_scanned++; - if (!pfn_valid_within(blockpfn)) - goto isolate_fail; /* * For compound pages such as THP and hugetlbfs, we can save @@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, cond_resched(); } - if (!pfn_valid_within(low_pfn)) - goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..8d3376f66f01 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1308,10 +1308,6 @@ struct zone 
*test_pages_in_a_zone(unsigned long start_pfn, for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && - !pfn_valid_within(pfn + i)) - i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; /* Check if we got outside of the zone */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..79a2fc5b6c6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { - if (!pfn_valid_within(page_to_pfn(page))) - return 0; if (zone != page_zone(page)) return 0; @@ -1025,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, if (order >= MAX_ORDER - 2) return false; - if (!pfn_valid_within(buddy_pfn)) - return false; - combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - return pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1); + return page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -1095,8 +1089,6 @@ continue_merging: buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (!pfn_valid_within(buddy_pfn)) - goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -1754,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. + * with the migration of free compaction scanner. 
* * Return struct page pointer of start_pfn, or NULL if checks were not passed. * @@ -1872,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; return true; @@ -2520,11 +2508,6 @@ static int move_freepages(struct zone *zone, int pages_moved = 0; for (pfn = start_pfn; pfn <= end_pfn;) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } - page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* @@ -8814,9 +8797,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, } for (; iter < pageblock_nr_pages - offset; iter++) { - if (!pfn_valid_within(pfn + iter)) - continue; - page = pfn_to_page(pfn + iter); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bf..471e3a13b541 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(buddy_pfn) && - !is_migrate_isolate_page(buddy)) { + if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; } @@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, struct page *page; while (pfn < end_pfn) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); if (PageBuddy(page)) /* diff --git a/mm/page_owner.c b/mm/page_owner.c index f51a57e92aa3..62402d22539b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - /* The pageblock is online, no need to recheck. 
*/ page = pfn_to_page(pfn); @@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - /* Check for holes within a MAX_ORDER area */ - if (!pfn_valid_within(pfn)) - continue; - page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); @@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) continue; -- cgit v1.2.3-71-gd317 From 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:54:59 -0700 Subject: mm/memory_hotplug: use "unsigned long" for PFN in zone_for_pfn_range() Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory" These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups. These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest. [1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com This patch (of 4): Checkpatch complained on a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int" and checkpatch is correct. As we will search for a fitting zone using the wrong pfn, we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly as the onlined memory does not satisfy properties of these zones. Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB. 
Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Heiko Carstens Cc: Michael Ellerman Cc: Catalin Marinas Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Jiang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Michel Lespinasse Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Will Deacon Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a7fd2c3ccb77..d01b504ce06f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -339,8 +339,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f829805fe1ca..fa349acb8810 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -708,8 +708,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? 
movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages) +struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); -- cgit v1.2.3-71-gd317 From 65a2aa5f482ed0c1b5afb9e6b0b9e0b16bb8b616 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:04 -0700 Subject: mm/memory_hotplug: remove nid parameter from arch_remove_memory() The parameter is unused, let's remove it. Link: https://lkml.kernel.org/r/20210712124052.26491-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas Acked-by: Michael Ellerman [powerpc] Acked-by: Heiko Carstens [s390] Reviewed-by: Pankaj Gupta Reviewed-by: Oscar Salvador Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Baoquan He Cc: Laurent Dufour Cc: Sergei Trofimovich Cc: Kefeng Wang Cc: Michel Lespinasse Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: Thiago Jung Bauermann Cc: Joe Perches Cc: Pierre Morel Cc: Jia He Cc: Anton Blanchard Cc: Dan Williams Cc: Dave Jiang Cc: Jason Wang Cc: Len Brown Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Nathan Lynch Cc: Pankaj Gupta Cc: "Rafael J. Wysocki" Cc: "Rafael J. 
Wysocki" Cc: Scott Cheloha Cc: Vishal Verma Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 3 +-- arch/ia64/mm/init.c | 3 +-- arch/powerpc/mm/mem.c | 3 +-- arch/s390/mm/init.c | 3 +-- arch/sh/mm/init.c | 3 +-- arch/x86/mm/init_32.c | 3 +-- arch/x86/mm/init_64.c | 3 +-- include/linux/memory_hotplug.h | 3 +-- mm/memory_hotplug.c | 4 ++-- mm/memremap.c | 5 +---- 10 files changed, 11 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9ff0de1b2b93..cfd9deb347c3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1502,8 +1502,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 064a967a7b6e..5c6da8d83c1a 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad198b439222..c3c4e31462ec 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return rc; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size 
>> PAGE_SHIFT; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8ac710de1ab1..d85bd7f5d8dc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -306,8 +306,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ce26c7f8950a..506784702430 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 74b78840182d..bd90b8fe81e4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, params); } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ddeaba947eb3..a6e11763763f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = 
start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d01b504ce06f..010a192298b5 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -130,8 +130,7 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fa349acb8810..14c4f6051c13 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1106,7 +1106,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(start, size, NULL); goto error; } @@ -1886,7 +1886,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, altmap); + arch_remove_memory(start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/memremap.c b/mm/memremap.c index 15a074ffb8d7..ed593bf87109 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -140,14 +140,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - int nid; /* make sure to access a memmap that was actually initialized */ first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(first_page); - mem_hotplug_begin(); remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), 
PHYS_PFN(range_len(range))); @@ -155,7 +152,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) __remove_pages(PHYS_PFN(range->start), PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, range->start, range_len(range), + arch_remove_memory(range->start, range_len(range), pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } -- cgit v1.2.3-71-gd317 From e1c158e4956612e7bada4c03dfb99210af4d6cde Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:09 -0700 Subject: mm/memory_hotplug: remove nid parameter from remove_memory() and friends There is only a single user remaining. We can simply lookup the nid only used for node offlining purposes when walking our memory blocks. We don't expect to remove multi-nid ranges; and if we'd ever do, we most probably don't care about removing multi-nid ranges that actually result in empty nodes. If ever required, we can detect the "multi-nid" scenario and simply try offlining all online nodes. Link: https://lkml.kernel.org/r/20210712124052.26491-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael Ellerman (powerpc) Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Nathan Lynch Cc: Laurent Dufour Cc: "Aneesh Kumar K.V" Cc: Scott Cheloha Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/hotplug-memory.c | 9 ++++---- drivers/acpi/acpi_memhotplug.c | 7 +------ drivers/dax/kmem.c | 3 +-- drivers/virtio/virtio_mem.c | 4 ++-- include/linux/memory_hotplug.h | 10 ++++----- mm/memory_hotplug.c | 28 +++++++++++++++---------- 6 files changed, 30 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..ef5c24b42cf1 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si { unsigned long block_sz, start_pfn; int sections_per_block; - int i, nid; + int i; start_pfn = base >> PAGE_SHIFT; @@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si block_sz = pseries_memory_block_size(); sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - nid = memory_add_physaddr_to_nid(base); for (i = 0; i < sections_per_block; i++) { - __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(base, MIN_MEMORY_BLOCK_SIZE); base += MIN_MEMORY_BLOCK_SIZE; } @@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) block_sz = pseries_memory_block_size(); - __remove_memory(mem_block->nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); put_device(&mem_block->dev); /* Update memory regions for memory remove */ @@ -660,7 +659,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); 
invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 8cc195c4c861..1d01d9414c40 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -239,19 +239,14 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device) { - acpi_handle handle = mem_device->device->handle; struct acpi_memory_info *info, *n; - int nid = acpi_get_node(handle); list_for_each_entry_safe(info, n, &mem_device->res_list, list) { if (!info->enabled) continue; - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(info->start_addr); - acpi_unbind_memory_blocks(info); - __remove_memory(nid, info->start_addr, info->length); + __remove_memory(info->start_addr, info->length); list_del(&info->list); kfree(info); } diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index ac231cc36359..99e0f60c4c26 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -156,8 +156,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) if (rc) continue; - rc = remove_memory(dev_dax->target_node, range.start, - range_len(&range)); + rc = remove_memory(range.start, range_len(&range)); if (rc == 0) { release_resource(data->res[i]); kfree(data->res[i]); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index b91bc810a87e..7e83ed373e00 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -677,7 +677,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, addr + size - 1); - rc = remove_memory(vm->nid, addr, size); + rc = remove_memory(addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); /* @@ -720,7 +720,7 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, "offlining and removing memory: 0x%llx - 0x%llx\n", 
addr, addr + size - 1); - rc = offline_and_remove_memory(vm->nid, addr, size); + rc = offline_and_remove_memory(addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); /* diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010a192298b5..068e3dcf4690 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -292,9 +292,9 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern int remove_memory(int nid, u64 start, u64 size); -extern void __remove_memory(int nid, u64 start, u64 size); -extern int offline_and_remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern void __remove_memory(u64 start, u64 size); +extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} @@ -304,12 +304,12 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline int remove_memory(int nid, u64 start, u64 size) +static inline int remove_memory(u64 start, u64 size) { return -EBUSY; } -static inline void __remove_memory(int nid, u64 start, u64 size) {} +static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void set_zone_contiguous(struct zone *zone); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 14c4f6051c13..6ea62efe2a8f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1739,7 +1739,9 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); + int *nid = arg; + *nid = mem->nid; if (unlikely(ret)) { phys_addr_t beginpa, endpa; @@ -1832,12 +1834,12 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -static int __ref try_remove_memory(int nid, u64 start, u64 size) +static int 
__ref try_remove_memory(u64 start, u64 size) { - int rc = 0; struct vmem_altmap mhp_altmap = {}; struct vmem_altmap *altmap = NULL; unsigned long nr_vmemmap_pages; + int rc = 0, nid = NUMA_NO_NODE; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1845,8 +1847,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and return error * if this is not the case. + * + * While at it, determine the nid. Note that if we'd have mixed nodes, + * we'd only try to offline the last determined one -- which is good + * enough for the cases we care about. */ - rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); + rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); if (rc) return rc; @@ -1895,7 +1901,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) release_mem_region_adjustable(start, size); - try_offline_node(nid); + if (nid != NUMA_NO_NODE) + try_offline_node(nid); mem_hotplug_done(); return 0; @@ -1903,7 +1910,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /** * __remove_memory - Remove memory if every memory block is offline - * @nid: the node ID * @start: physical address of the region to remove * @size: size of the region to remove * @@ -1911,14 +1917,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * and online/offline operations before this call, as required by * try_offline_node(). 
*/ -void __remove_memory(int nid, u64 start, u64 size) +void __remove_memory(u64 start, u64 size) { /* * trigger BUG() if some memory is not offlined prior to calling this * function */ - if (try_remove_memory(nid, start, size)) + if (try_remove_memory(start, size)) BUG(); } @@ -1926,12 +1932,12 @@ void __remove_memory(int nid, u64 start, u64 size) * Remove memory if every memory block is offline, otherwise return -EBUSY is * some memory is not offline */ -int remove_memory(int nid, u64 start, u64 size) +int remove_memory(u64 start, u64 size) { int rc; lock_device_hotplug(); - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); unlock_device_hotplug(); return rc; @@ -1991,7 +1997,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) * unplugged all memory (so it's no longer in use) and want to offline + remove * that memory. */ -int offline_and_remove_memory(int nid, u64 start, u64 size) +int offline_and_remove_memory(u64 start, u64 size) { const unsigned long mb_count = size / memory_block_size_bytes(); uint8_t *online_types, *tmp; @@ -2027,7 +2033,7 @@ int offline_and_remove_memory(int nid, u64 start, u64 size) * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { - rc = try_remove_memory(nid, start, size); + rc = try_remove_memory(start, size); if (rc) pr_err("%s: Failed to remove memory: %d", __func__, rc); } -- cgit v1.2.3-71-gd317 From 4b0970024408afb17886e0c76e9761c4264db2a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:19 -0700 Subject: mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. 
We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 
4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is required for fast memory hotplug for drivers that add individual memory blocks, like virtio-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the configuration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy. II. Approach This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion).
For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM. III. Target Usage The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enables auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online") IV. Example For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-79: Movable (DIMM 0) Memory block 80-111: Movable (DIMM 1) Memory block 112-143: Movable (DIMM 2) Memory block 144-175: Normal (DIMM 3) Memory block 176-207: Normal (DIMM 4) ... all Normal (-> hotplugged Normal memory does not allow for more Movable memory) For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-143: Movable (virtio-mem, first 12 GiB) Memory block 144: Normal (virtio-mem, next 128 MiB) Memory block 145-147: Movable (virtio-mem, next 384 MiB) Memory block 148: Normal (virtio-mem, next 128 MiB) Memory block 149-151: Movable (virtio-mem, next 384 MiB) ... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device) Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps.
V. Doc Update I'll update the memory-hotplug.rst documentation, once the overhaul [1] is upstream. Until then, details can be found in patch #2. VI. Future Work 1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really required. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar as adjust_managed_page_count() and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J.
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 14 +++++++------- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ mm/memory_hotplug.c | 14 +++++++++++--- mm/page_alloc.c | 3 +++ 5 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7..86ec2dc82fc2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -205,7 +205,8 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } @@ -215,24 +216,23 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; - struct zone *zone; int ret; /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ - if (nr_vmemmap_pages) { - zone = page_zone(pfn_to_page(start_pfn)); - adjust_present_page_count(zone, -nr_vmemmap_pages); - } + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), + -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages); if (ret) { /* offline_pages() failed. Account back. 
*/ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf4690..39b04e99a30e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ee3a86830519..1c0e3bf42521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. 
+ * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6ea62efe2a8f..8a99fa6d096c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -724,8 +724,16 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct zone *zone, long nr_pages) +void adjust_present_page_count(struct page *page, long nr_pages) { + struct zone *zone = page_zone(page); + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. 
+ */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; } @@ -826,7 +834,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(zone, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1697,7 +1705,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(zone, -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79a2fc5b6c6f..9353418892a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7240,6 +7240,9 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; +#endif totalpages += size; realtotalpages += real_size; -- cgit v1.2.3-71-gd317 From 028fc57a1c361116e3bcebfeba4ca87878baaf4f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:26 -0700 Subject: drivers/base/memory: introduce "memory groups" to logically group memory blocks In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. 
Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Link: https://lkml.kernel.org/r/20210806124715.17090-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. 
Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 159 +++++++++++++++++++++++++++++++++++++++-- include/linux/memory.h | 46 +++++++++++- include/linux/memory_hotplug.h | 5 ++ mm/memory_hotplug.c | 11 ++- 4 files changed, 215 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 86ec2dc82fc2..16f5a3610229 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -82,6 +82,11 @@ static struct bus_type memory_subsys = { */ static DEFINE_XARRAY(memory_blocks); +/* + * Memory groups, indexed by memory group id (mgid). + */ +static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); + static BLOCKING_NOTIFIER_HEAD(memory_chain); int register_memory_notifier(struct notifier_block *nb) @@ -634,7 +639,8 @@ int register_memory(struct memory_block *memory) } static int init_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages) + unsigned long nr_vmemmap_pages, + struct memory_group *group) { struct memory_block *mem; int ret = 0; @@ -652,6 +658,12 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->state = state; mem->nid = NUMA_NO_NODE; mem->nr_vmemmap_pages = nr_vmemmap_pages; + INIT_LIST_HEAD(&mem->group_next); + + if (group) { + mem->group = group; + list_add(&mem->group_next, &group->memory_blocks); + } ret = register_memory(mem); @@ -671,7 +683,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0); + MEM_ONLINE, 0, NULL); } static void unregister_memory(struct memory_block *memory) @@ -681,6 +693,11 @@ static void unregister_memory(struct memory_block *memory) WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); + if (memory->group) { + list_del(&memory->group_next); + memory->group = NULL; + } + /* drop 
the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); @@ -694,7 +711,8 @@ static void unregister_memory(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages) + unsigned long vmemmap_pages, + struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -707,7 +725,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages, + group); if (ret) break; } @@ -891,3 +910,135 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) return bus_for_each_dev(&memory_subsys, NULL, &cb_data, for_each_memory_block_cb); } + +/* + * This is an internal helper to unify allocation and initialization of + * memory groups. Note that the passed memory group will be copied to a + * dynamically allocated memory group. After this call, the passed + * memory group should no longer be used. + */ +static int memory_group_register(struct memory_group group) +{ + struct memory_group *new_group; + uint32_t mgid; + int ret; + + if (!node_possible(group.nid)) + return -EINVAL; + + new_group = kzalloc(sizeof(group), GFP_KERNEL); + if (!new_group) + return -ENOMEM; + *new_group = group; + INIT_LIST_HEAD(&new_group->memory_blocks); + + ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b, + GFP_KERNEL); + if (ret) { + kfree(new_group); + return ret; + } + return mgid; +} + +/** + * memory_group_register_static() - Register a static memory group. + * @nid: The node id. + * @max_pages: The maximum number of pages we'll have in this static memory + * group. 
+ * + * Register a new static memory group and return the memory group id. + * All memory in the group belongs to a single unit, such as a DIMM. All + * memory belonging to a static memory group is added in one go to be removed + * in one go -- it's static. + * + * Returns an error if out of memory, if the node id is invalid, if no new + * memory groups can be registered, or if max_pages is invalid (0). Otherwise, + * returns the new memory group id. + */ +int memory_group_register_static(int nid, unsigned long max_pages) +{ + struct memory_group group = { + .nid = nid, + .s = { + .max_pages = max_pages, + }, + }; + + if (!max_pages) + return -EINVAL; + return memory_group_register(group); +} +EXPORT_SYMBOL_GPL(memory_group_register_static); + +/** + * memory_group_register_dynamic() - Register a dynamic memory group. + * @nid: The node id. + * @unit_pages: Unit in pages in which is memory added/removed in this dynamic + * memory group. + * + * Register a new dynamic memory group and return the memory group id. + * Memory within a dynamic memory group is added/removed dynamically + * in unit_pages. + * + * Returns an error if out of memory, if the node id is invalid, if no new + * memory groups can be registered, or if unit_pages is invalid (0, not a + * power of two, smaller than a single memory block). Otherwise, returns the + * new memory group id. + */ +int memory_group_register_dynamic(int nid, unsigned long unit_pages) +{ + struct memory_group group = { + .nid = nid, + .is_dynamic = true, + .d = { + .unit_pages = unit_pages, + }, + }; + + if (!unit_pages || !is_power_of_2(unit_pages) || + unit_pages < PHYS_PFN(memory_block_size_bytes())) + return -EINVAL; + return memory_group_register(group); +} +EXPORT_SYMBOL_GPL(memory_group_register_dynamic); + +/** + * memory_group_unregister() - Unregister a memory group. + * @mgid: the memory group id + * + * Unregister a memory group. If any memory block still belongs to this + * memory group, unregistering will fail. 
+ * + * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some + * memory blocks still belong to this memory group and returns 0 if + * unregistering succeeded. + */ +int memory_group_unregister(int mgid) +{ + struct memory_group *group; + + if (mgid < 0) + return -EINVAL; + + group = xa_load(&memory_groups, mgid); + if (!group) + return -EINVAL; + if (!list_empty(&group->memory_blocks)) + return -EBUSY; + xa_erase(&memory_groups, mgid); + kfree(group); + return 0; +} +EXPORT_SYMBOL_GPL(memory_group_unregister); + +/* + * This is an internal helper only to be used in core memory hotplug code to + * lookup a memory group. We don't care about locking, as we don't expect a + * memory group to get unregistered while adding memory to it -- because + * the group and the memory is managed by the same driver. + */ +struct memory_group *memory_group_find_by_id(int mgid) +{ + return xa_load(&memory_groups, mgid); +} diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d505c12c5c77 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +/** + * struct memory_group - a logical group of memory blocks + * @nid: The node id for all memory blocks inside the memory group. + * @blocks: List of all memory blocks belonging to this memory group. + * @is_dynamic: The memory group type: static vs. dynamic + * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum + * number of pages we'll have in this static memory group. + * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages + * in which memory is added/removed in this dynamic memory group. + * This granularity defines the alignment of a unit in physical + * address space; it has to be at least as big as a single + * memory block. + * + * A memory group logically groups memory blocks; each memory block + * belongs to at most one memory group. 
A memory group corresponds to + * a memory device, such as a DIMM or a NUMA node, which spans multiple + * memory blocks and might even span multiple non-contiguous physical memory + * ranges. + * + * Modification of members after registration is serialized by memory + * hot(un)plug code. + */ +struct memory_group { + int nid; + struct list_head memory_blocks; + bool is_dynamic; + union { + struct { + unsigned long max_pages; + } s; + struct { + unsigned long unit_pages; + } d; + }; +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -34,6 +70,8 @@ struct memory_block { * lay at the beginning of the memory block. */ unsigned long nr_vmemmap_pages; + struct memory_group *group; /* group (if any) for this block */ + struct list_head group_next; /* next block inside memory group */ }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -86,7 +124,8 @@ static inline int memory_notify(unsigned long val, void *v) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages); + unsigned long vmemmap_pages, + struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); @@ -96,6 +135,11 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<nid; + } + if (!node_possible(nid)) { WARN(1, "node %d was absent from the node_possible_map\n", nid); return -EINVAL; @@ -1303,7 +1311,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was 
added */ - ret = create_memory_block_devices(start, size, mhp_altmap.alloc); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc, + group); if (ret) { arch_remove_memory(start, size, NULL); goto error; -- cgit v1.2.3-71-gd317 From 836809ec75cc07c6d07c43036e3844affbe0d46f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:30 -0700 Subject: mm/memory_hotplug: track present pages in memory groups Let's track all present pages in each memory group. Especially, track memory present in ZONE_MOVABLE and memory present in one of the kernel zones (which really only is ZONE_NORMAL right now as memory groups only apply to hotplugged memory) separately within a memory group, to prepare for making smart auto-online decision for individual memory blocks within a memory group based on group statistics. Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. 
Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 +++++----- include/linux/memory.h | 6 ++++++ include/linux/memory_hotplug.h | 13 +++++++++---- mm/memory_hotplug.c | 19 ++++++++++++++----- 4 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 16f5a3610229..a1082013e10c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static int memory_block_online(struct memory_block *mem) } ret = online_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages, zone); + nr_pages - nr_vmemmap_pages, zone, mem->group); if (ret) { if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); @@ -210,7 +210,7 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, nr_vmemmap_pages); return ret; @@ -228,16 +228,16 @@ static int memory_block_offline(struct memory_block *mem) * can properly be torn down in offline_pages(). */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages); + nr_pages - nr_vmemmap_pages, mem->group); if (ret) { /* offline_pages() failed. Account back. 
*/ if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), - nr_vmemmap_pages); + mem->group, nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory.h b/include/linux/memory.h index d505c12c5c77..6ffdc1db385f 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,6 +27,10 @@ * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. * @blocks: List of all memory blocks belonging to this memory group. + * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this + * memory group. + * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this + * memory group. * @is_dynamic: The memory group type: static vs. dynamic * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum * number of pages we'll have in this static memory group. @@ -48,6 +52,8 @@ struct memory_group { int nid; struct list_head memory_blocks; + unsigned long present_kernel_pages; + unsigned long present_movable_pages; bool is_dynamic; union { struct { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5d341978b4bc..cf3f423c8a74 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -12,6 +12,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct memory_group; struct resource; struct vmem_altmap; @@ -100,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct page *page, long nr_pages); +extern void adjust_present_page_count(struct page *page, + struct memory_group *group, + long nr_pages); /* VM interface that may be used by firmware interface */ extern int 
mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, struct memory_group *group); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -296,7 +299,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -304,7 +308,8 @@ extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { return -EINVAL; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd57a296dd27..8199a4f98b2b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -915,9 +915,11 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. 
*/ -void adjust_present_page_count(struct page *page, long nr_pages) +void adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages) { struct zone *zone = page_zone(page); + const bool movable = zone_idx(zone) == ZONE_MOVABLE; /* * We only support onlining/offlining/adding/removing of complete @@ -927,6 +929,11 @@ void adjust_present_page_count(struct page *page, long nr_pages) zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; + + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, @@ -972,7 +979,8 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) { unsigned long flags; int need_zonelists_rebuild = 0; @@ -1025,7 +1033,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(pfn_to_page(pfn), nr_pages); + adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1769,7 +1777,8 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; @@ -1905,7 +1914,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ 
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); -- cgit v1.2.3-71-gd317 From 445fcf7c721450dd1d4ec6c217b3c6a932602a44 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:45 -0700 Subject: mm/memory_hotplug: memory group aware "auto-movable" online policy Use memory groups to improve our "auto-movable" onlining policy: 1. For static memory groups (e.g., a DIMM), online a memory block MOVABLE only if all other memory blocks in the group are either MOVABLE or could be onlined MOVABLE. A DIMM will either be MOVABLE or not, not a mixture. 2. For dynamic memory groups (e.g., a virtio-mem device), online a memory block MOVABLE only if all other memory blocks inside the current unit are either MOVABLE or could be onlined MOVABLE. For a virtio-mem device with a device block size of 512 MiB, all 128 MiB memory blocks within a 512 MiB unit will either be MOVABLE or not, not a mixture. We have to pass the memory group to zone_for_pfn_range() to take the memory group into account. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. 
Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 18 +++++++++------- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 48 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 57 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a1082013e10c..b699ddc42693 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -182,7 +182,8 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; - zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, + start_pfn, nr_pages); /* * Although vmemmap pages have a different lifecycle than the pages @@ -379,12 +380,13 @@ static ssize_t phys_device_show(struct device *dev, #ifdef CONFIG_MEMORY_HOTREMOVE static int print_allowed_zone(char *buf, int len, int nid, + struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages, int online_type, struct zone *default_zone) { struct zone *zone; - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages); if (zone == default_zone) return 0; @@ -397,9 +399,10 @@ static ssize_t valid_zones_show(struct device *dev, struct memory_block *mem = to_memory_block(dev); unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + struct memory_group *group = mem->group; struct zone *default_zone; + int nid = mem->nid; int len = 0; - int nid; /* * Check the existing zone. 
Make sure that we do that only on the @@ -418,14 +421,13 @@ static ssize_t valid_zones_show(struct device *dev, goto out; } - nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, - nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group, + start_pfn, nr_pages); len += sysfs_emit_at(buf, len, "%s", default_zone->name); - len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages, + len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, default_zone); - len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages, + len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, default_zone); out: len += sysfs_emit_at(buf, len, "\n"); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cf3f423c8a74..e5a867c950b2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -349,7 +349,8 @@ extern void sparse_remove_section(struct mem_section *ms, extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages); + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8199a4f98b2b..248e2ba4ac59 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -852,12 +852,53 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * "present pages" is an upper limit that can get reached at runtime. As * we base our calculations on KERNEL_EARLY, this is not an issue. 
*/ -static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn, +static struct zone *auto_movable_zone_for_pfn(int nid, + struct memory_group *group, + unsigned long pfn, unsigned long nr_pages) { + unsigned long online_pages = 0, max_pages, end_pfn; + struct page *page; + if (!auto_movable_ratio) goto kernel_zone; + if (group && !group->is_dynamic) { + max_pages = group->s.max_pages; + online_pages = group->present_movable_pages; + + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (group->present_kernel_pages) + goto kernel_zone; + } else if (!group || group->d.unit_pages == nr_pages) { + max_pages = nr_pages; + } else { + max_pages = group->d.unit_pages; + /* + * Take a look at all online sections in the current unit. + * We can safely assume that all pages within a section belong + * to the same zone, because dynamic memory groups only deal + * with hotplugged memory. + */ + pfn = ALIGN_DOWN(pfn, group->d.unit_pages); + end_pfn = pfn + group->d.unit_pages; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (page_zonenum(page) != ZONE_MOVABLE) + goto kernel_zone; + online_pages += PAGES_PER_SECTION; + } + } + + /* + * Online MOVABLE if we could *currently* online all remaining parts + * MOVABLE. We expect to (add+) online them immediately next, so if + * nobody interferes, all will be MOVABLE if possible. 
+ */ nr_pages = max_pages - online_pages; if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) goto kernel_zone; @@ -897,7 +938,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn } struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); @@ -906,7 +948,7 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) - return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages); + return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); return default_zone_for_pfn(nid, start_pfn, nr_pages); } -- cgit v1.2.3-71-gd317 From 3fcebf90209a7f52d384ad7701425aa91be309ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:48 -0700 Subject: mm/memory_hotplug: improved dynamic memory group aware "auto-movable" online policy Currently, the "auto-movable" online policy does not allow for hotplugged KERNEL (ZONE_NORMAL) memory to increase the amount of MOVABLE memory we can have, primarily, because there is no coordination across memory devices and we don't want to create zone-imbalances accidentally when unplugging memory. However, within a single memory device it's different. Let's allow for KERNEL memory within a dynamic memory group to allow for more MOVABLE within the same memory group. The only thing we have to take care of is that the managing driver avoids zone imbalances by unplugging MOVABLE memory first, otherwise there can be corner cases where unplug of memory could result in (accidental) zone imbalances. 
virtio-mem is the only user of dynamic memory groups and recently added support for prioritizing unplug of ZONE_MOVABLE over ZONE_NORMAL, so we don't need a new toggle to enable it for dynamic memory groups. We limit this handling to dynamic memory groups, because: * We want to keep the runtime overhead for collecting stats when onlining a single memory block small. We tend to have only a handful of dynamic memory groups, but we can have quite some static memory groups (e.g., 256 DIMMs). * It doesn't make too much sense for static memory groups, as we try onlining all applicable memory blocks either completely to ZONE_MOVABLE or not. In ordinary operation, we won't have a mixture of zones within a static memory group. When adding memory to a dynamic memory group, we'll first online memory to ZONE_MOVABLE as long as early KERNEL memory allows for it. Then, we'll online the next unit(s) to ZONE_NORMAL, until we can online the next unit(s) to ZONE_MOVABLE. For a simple virtio-mem device with a MOVABLE:KERNEL ratio of 3:1, it will result in a layout like: [M][M][M][M][M][M][M][M][N][M][M][M][N][M][M][M]... ^ movable memory due to early kernel memory ^ allows for more movable memory ... ^-----^ ... here ^ allows for more movable memory ... ^-----^ ... here While the created layout is sub-optimal when it comes to contiguous zones, it gives us the maximum flexibility when dynamically growing/shrinking a device; we can grow small VMs really big in small steps, and still shrink reliably to e.g., 1/4 of the maximum VM size in this example, removing full memory blocks along with meta data more reliably. Mark dynamic memory groups in the xarray such that we can efficiently iterate over them when collecting stats. In usual setups, we have one virtio-mem device per NUMA node, and usually only a small number of NUMA nodes. Note: for now, there seems to be no compelling reason to make this behavior configurable. 
Link: https://lkml.kernel.org/r/20210806124715.17090-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 30 +++++++++++++++++++++++++ include/linux/memory.h | 3 +++ mm/memory_hotplug.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b699ddc42693..440fd656c002 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -86,6 +86,7 @@ static DEFINE_XARRAY(memory_blocks); * Memory groups, indexed by memory group id (mgid). */ static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); +#define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1 static BLOCKING_NOTIFIER_HEAD(memory_chain); @@ -939,6 +940,8 @@ static int memory_group_register(struct memory_group group) if (ret) { kfree(new_group); return ret; + } else if (group.is_dynamic) { + xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC); } return mgid; } @@ -1044,3 +1047,30 @@ struct memory_group *memory_group_find_by_id(int mgid) { return xa_load(&memory_groups, mgid); } + +/* + * This is an internal helper only to be used in core memory hotplug code to + * walk all dynamic memory groups excluding a given memory group, either + * belonging to a specific node, or belonging to any node. 
+ */ +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg) +{ + struct memory_group *group; + unsigned long index; + int ret = 0; + + xa_for_each_marked(&memory_groups, index, group, + MEMORY_GROUP_MARK_DYNAMIC) { + if (group == excluded) + continue; +#ifdef CONFIG_NUMA + if (nid != NUMA_NO_NODE && group->nid != nid) + continue; +#endif /* CONFIG_NUMA */ + ret = func(group, arg); + if (ret) + break; + } + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h index 6ffdc1db385f..cbcc43ad2b97 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -146,6 +146,9 @@ extern int memory_group_register_static(int nid, unsigned long max_pages); extern int memory_group_register_dynamic(int nid, unsigned long unit_pages); extern int memory_group_unregister(int mgid); struct memory_group *memory_group_find_by_id(int mgid); +typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 248e2ba4ac59..b80fb8164fb8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -752,11 +752,44 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, #endif /* CONFIG_CMA */ } } +struct auto_movable_group_stats { + unsigned long movable_pages; + unsigned long req_kernel_early_pages; +}; -static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) +static int auto_movable_stats_account_group(struct memory_group *group, + void *arg) +{ + const int ratio = READ_ONCE(auto_movable_ratio); + struct auto_movable_group_stats *stats = arg; + long pages; + + /* + * We don't support modifying the config while the auto-movable online + * policy is already enabled. Just avoid the division by zero below. 
+ */ + if (!ratio) + return 0; + + /* + * Calculate how many early kernel pages this group requires to + * satisfy the configured zone ratio. + */ + pages = group->present_movable_pages * 100 / ratio; + pages -= group->present_kernel_pages; + + if (pages > 0) + stats->req_kernel_early_pages += pages; + stats->movable_pages += group->present_movable_pages; + return 0; +} + +static bool auto_movable_can_online_movable(int nid, struct memory_group *group, + unsigned long nr_pages) { - struct auto_movable_stats stats = {}; unsigned long kernel_early_pages, movable_pages; + struct auto_movable_group_stats group_stats = {}; + struct auto_movable_stats stats = {}; pg_data_t *pgdat = NODE_DATA(nid); struct zone *zone; int i; @@ -777,6 +810,21 @@ static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) kernel_early_pages = stats.kernel_early_pages; movable_pages = stats.movable_pages; + /* + * Kernel memory inside dynamic memory group allows for more MOVABLE + * memory within the same group. Remove the effect of all but the + * current group from the stats. + */ + walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, + group, &group_stats); + if (kernel_early_pages <= group_stats.req_kernel_early_pages) + return false; + kernel_early_pages -= group_stats.req_kernel_early_pages; + movable_pages -= group_stats.movable_pages; + + if (group && group->is_dynamic) + kernel_early_pages += group->present_kernel_pages; + /* * Test if we could online the given number of pages to ZONE_MOVABLE * and still stay in the configured ratio. @@ -834,6 +882,10 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * with unmovable allocations). While there are corner cases where it might * still work, it is barely relevant in practice. 
* + * Exceptions are dynamic memory groups, which allow for more MOVABLE + * memory within the same memory group -- because in that case, there is + * coordination within the single memory device managed by a single driver. + * * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the @@ -899,12 +951,12 @@ static struct zone *auto_movable_zone_for_pfn(int nid, * nobody interferes, all will be MOVABLE if possible. */ nr_pages = max_pages - online_pages; - if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) + if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) goto kernel_zone; #ifdef CONFIG_NUMA if (auto_movable_numa_aware && - !auto_movable_can_online_movable(nid, nr_pages)) + !auto_movable_can_online_movable(nid, group, nr_pages)) goto kernel_zone; #endif /* CONFIG_NUMA */ -- cgit v1.2.3-71-gd317 From fe3df441ef885a75a3eff5e151ead1a92266d222 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:55:55 -0700 Subject: mm: remove redundant compound_head() calling There is a READ_ONCE() in the macro of compound_head(), which will prevent compiler from optimizing the code when there are more than once calling of it in a function. Remove the redundant calling of compound_head() from page_to_index() and page_add_file_rmap() for better code generation. Link: https://lkml.kernel.org/r/20210811101431.83940-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Howells Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Kirill A. 
Shutemov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++---- mm/rmap.c | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..904e57db3a7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -521,18 +521,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_index(struct page *page) { - pgoff_t pgoff; + struct page *head; if (likely(!PageTransTail(page))) return page->index; + head = compound_head(page); /* * We don't initialize ->index for tail pages: calculate based on * head page */ - pgoff = compound_head(page)->index; - pgoff += page - compound_head(page); - return pgoff; + return head->index + page - head; } extern pgoff_t hugetlb_basepage_index(struct page *page); diff --git a/mm/rmap.c b/mm/rmap.c index b9eb5c12f3fe..b2cebf35ffe7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1230,11 +1230,13 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { + struct page *head = compound_head(page); + VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + SetPageDoubleMap(head); if (PageMlocked(page)) - clear_page_mlock(compound_head(page)); + clear_page_mlock(head); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; -- cgit v1.2.3-71-gd317 From 82a70ce0426dd7c4099516175019dccbd18cebf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 19:56:01 -0700 Subject: mm: move ioremap_page_range to vmalloc.c Patch series "small ioremap cleanups". The first patch moves a little code around the vmalloc/ioremap boundary following a bigger move by Nick earlier. The second enforces non-executable mapping on ioremap just like we do for vmap. 
No driver currently uses executable mappings anyway, as they should. This patch (of 2): This keeps it together with the implementation, and to remove the vmap_range wrapper. Link: https://lkml.kernel.org/r/20210824091259.1324527-1-hch@lst.de Link: https://lkml.kernel.org/r/20210824091259.1324527-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Nicholas Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- mm/Makefile | 3 ++- mm/ioremap.c | 25 ------------------------- mm/vmalloc.c | 22 +++++++++++++++++----- 4 files changed, 19 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2644425b6dce..671d402c3778 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -225,9 +225,6 @@ static inline bool is_vm_area_hugepages(const void *addr) } #ifdef CONFIG_MMU -int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { diff --git a/mm/Makefile b/mm/Makefile index e3436741d539..0e0a5a6fe127 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -38,7 +38,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o ioremap.o + pgtable-generic.o rmap.o vmalloc.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -128,3 +128,4 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o diff --git a/mm/ioremap.c b/mm/ioremap.c index 8ee0136f8cb0..5fe598ecd9b7 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -8,33 +8,9 @@ */ #include #include -#include 
#include #include -#include -#include "pgalloc-track.h" - -#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1; - -static int __init set_nohugeiomap(char *str) -{ - iomap_max_page_shift = PAGE_SHIFT; - return 0; -} -early_param("nohugeiomap", set_nohugeiomap); -#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static const unsigned int iomap_max_page_shift = PAGE_SHIFT; -#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ - -int ioremap_page_range(unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) -{ - return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); -} - -#ifdef CONFIG_GENERIC_IOREMAP void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) { unsigned long offset, vaddr; @@ -71,4 +47,3 @@ void iounmap(volatile void __iomem *addr) vunmap((void *)((unsigned long)addr & PAGE_MASK)); } EXPORT_SYMBOL(iounmap); -#endif /* CONFIG_GENERIC_IOREMAP */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5cd52805149..e44983fb2d15 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -44,6 +44,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_max_page_shift = PAGE_SHIFT; + return 0; +} +early_param("nohugeiomap", set_nohugeiomap); +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; @@ -298,15 +311,14 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, return err; } -int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift) +int ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) { int err; - 
err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + err = vmap_range_noflush(addr, end, phys_addr, prot, + ioremap_max_page_shift); flush_cache_vmap(addr, end); - return err; } -- cgit v1.2.3-71-gd317 From 513861202d1259e35934e206b79cd54f523d79b5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Sep 2021 19:56:09 -0700 Subject: highmem: don't disable preemption on RT in kmap_atomic() kmap_atomic() disables preemption and pagefaults for historical reasons. The conversion to kmap_local(), which only disables migration, cannot be done wholesale because quite some call sites need to be updated to accommodate with the changed semantics. On PREEMPT_RT enabled kernels the kmap_atomic() semantics are problematic due to the implicit disabling of preemption which makes it impossible to acquire 'sleeping' spinlocks within the kmap atomic sections. PREEMPT_RT replaces the preempt_disable() with a migrate_disable() for more than a decade. It could be argued that this is a justification to do this unconditionally, but PREEMPT_RT covers only a limited number of architectures and it disables some functionality which limits the coverage further. Limit the replacement to PREEMPT_RT for now. 
Link: https://lkml.kernel.org/r/20210810091116.pocdmaatdcogvdso@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Vlastimil Babka Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem-internal.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 7902c7d8b55f..4aa1031d3e4c 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr) static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_page_prot(page, prot); } @@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page) static inline void *kmap_atomic_pfn(unsigned long pfn) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } @@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr) { kunmap_local_indexed(addr); pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } unsigned int __nr_free_highpages(void); @@ -179,7 +190,10 @@ static inline void __kunmap_local(void *addr) static inline void *kmap_atomic(struct page *page) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -200,7 +214,10 @@ static inline void __kunmap_atomic(void *addr) kunmap_flush_on_unmap(addr); #endif pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } static inline 
unsigned int nr_free_highpages(void) { return 0; } -- cgit v1.2.3-71-gd317 From 41c961b9013ee9b6d0491f6926df546e37964b1f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:56:15 -0700 Subject: mm: introduce PAGEFLAGS_MASK to replace ((1UL << NR_PAGEFLAGS) - 1) Instead of hard-coding ((1UL << NR_PAGEFLAGS) - 1) everywhere, introducing PAGEFLAGS_MASK to make the code clear to get the page flags. Link: https://lkml.kernel.org/r/20210819150712.59948-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +++- include/trace/events/page_ref.h | 4 ++-- lib/test_printf.c | 2 +- lib/vsprintf.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..6b8d66965145 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -178,6 +178,8 @@ enum pageflags { PG_reported = PG_uptodate, }; +#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) + #ifndef __GENERATING_BOUNDS_H static inline unsigned long _compound_head(const struct page *page) @@ -859,7 +861,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. 
*/ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (PAGEFLAGS_MASK & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index 5d2ea93956ce..8a99c1cd417b 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d", __entry->pfn, - show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + show_page_flags(__entry->flags & PAGEFLAGS_MASK), __entry->count, __entry->mapcount, __entry->mapping, __entry->mt, __entry->val) @@ -88,7 +88,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d ret=%d", __entry->pfn, - show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + show_page_flags(__entry->flags & PAGEFLAGS_MASK), __entry->count, __entry->mapcount, __entry->mapping, __entry->mt, __entry->val, __entry->ret) diff --git a/lib/test_printf.c b/lib/test_printf.c index 8ac71aee46af..ec69953cf80c 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -614,7 +614,7 @@ page_flags_test(int section, int node, int zone, int last_cpupid, bool append = false; int i; - flags &= BIT(NR_PAGEFLAGS) - 1; + flags &= PAGEFLAGS_MASK; if (flags) { page_flags |= flags; snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 26c83943748a..cc7bdd3ac2ee 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2019,7 +2019,7 @@ static const struct page_flags_fields pff[] = { static char *format_page_flags(char *buf, char *end, unsigned long flags) { - unsigned long main_flags = flags & (BIT(NR_PAGEFLAGS) - 1); + unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; -- cgit v1.2.3-71-gd317 From 
2224d8485492e499ca2e5d25407f8502cc06f149 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:28 -0700 Subject: mm: introduce Data Access MONitor (DAMON) Patch series "Introduce Data Access MONitor (DAMON)", v34. Introduction ============ DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it - accurate (The monitored information is useful for DRAM level memory management. It might not appropriate for Cache-level accuracy, though.), - light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and - scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.). Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to aware real data access patterns. Experimental access pattern aware memory management optimization works that incurring high instrumentation overhead will be able to have another try. Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems. DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. 
Also, the basic test suite for DAMON is available under GPLv2[3]. [1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests Long-term Plan -------------- DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project. --------------------------------------------------------------------------- Primitives: PTE Accessed bit, PG_idle, rmap, (Intel CMT), ... Framework: DAMON Features: DAMOS, virtual addr, physical addr, ... Applications: DAMON-debugfs, (DARC), ... ^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Raw Interface: debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ... vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv Library: (libdamon), ... Tools: DAMO, (perf), ... --------------------------------------------------------------------------- The components in parentheses or marked as '...' are not implemented yet but in the future plan. IOW, those are the TODO tasks of DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/ Evaluations =========== We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel that v24 DAMON patchset is applied. DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%. DAMON is accurate and useful for memory management optimizations. 
An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of resident sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts. Please refer to the official document[1] or "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval.html Real-world User Story ===================== In summary, DAMON has been used on production systems and proved its usefulness. DAMON as a profiler ------------------- We analyzed characteristics of large-scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find interesting things below. There were obviously different access patterns under idle workload and active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high frequency. DAMON found a 7GB memory region that showed obviously high access frequency under the active workload. We believe this is the performance-effective working set and needs to be protected. There was a 4KB memory region that showed the highest access frequency under not only active but also idle workloads. We think this must be a hottest code section like thing that should never be paged out. For this analysis, DAMON used only 0.3-1% of single CPU time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. 
This is only a small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding 4KB highest region in 70GB memory) with the light overhead. DAMON as a system optimization tool ----------------------------------- We also found below potential performance problems on the systems and made DAMON-based solutions. The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code section to not mistakenly be evicted. Using direct IO or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use DAMON-based operation scheme[1]. By using it, we can ask DAMON to track access frequency of each region and make 'process_madvise(MADV_WILLNEED)[2]' call for regions having specific size and access frequency for a time interval. We also found the system is having a high number of TLB misses. We tried 'always' THP enabled policy and it greatly reduced TLB misses, but the page reclamation also became more frequent due to the THP internal fragmentation caused memory bloat. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having >=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to regions having <2MB size and low access frequency. We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers were satisfied with the analysis results and promised to try the optimization guides. 
[1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/ Comparison with Idle Page Tracking ================================== Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by 1. find PFN of each page for workloads in interest, 2. set all the pages as idle by doing writes to the bitmap file, 3. wait until the workload accesses its working set, and 4. read the idleness of the pages again and count pages became not idle. NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily exposed to the user space. Hence, this section only assumes such user space use of DAMON. For what use cases Idle Page Tracking would be better? ------------------------------------------------------ 1. Flexible usecases other than hotness monitoring. Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For the reason, there could be some use case that using Idle Page Tracking is simpler. 2. Physical memory monitoring. Idle Page Tracking receives PFN range as input, so natively supports physical memory monitoring. DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, by theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exists. However, the default primitives introduced by this patchset supports only virtual address spaces. 
Therefore, for physical memory monitoring, you should implement your own primitives and use it, or simply use Idle Page Tracking. Nonetheless, RFC patchsets[1] for the physical memory address space primitives is already available. It also supports user memory same to Idle Page Tracking. [1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/ For what use cases DAMON is better? ----------------------------------- 1. Hotness Monitoring. Idle Page Tracking let users know only if a page frame is accessed or not. For hotness check, the user should write more code and use more memory. DAMON do that by itself. 2. Low Monitoring Overhead DAMON receives user's monitoring request with one step and then provide the results. So, roughly speaking, DAMON require only O(1) user/kernel context switches. In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be not negligible. Moreover, DAMON is born to handle with the monitoring overhead. Because the core mechanism is pure logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case. 3. Page granularity working set size detection. Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. Size of the single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used. 
All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used. There are more details to consider, but roughly speaking, this is true in most cases. However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead and fewer user-kernel context switches. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon. Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though. [1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master 4. More future usecases While Idle Page Tracking has tight coupling with base primitives (PG_idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could be changed a lot in future, using DAMON could be better. Can I use both Idle Page Tracking and DAMON? -------------------------------------------- Yes, though using them concurrently for overlapping memory regions could result in interference to each other. Nevertheless, such use case would be rare or makes no sense at all. Even in that case, the noise would not be really significant. So, you can choose whatever you want depending on the characteristics of your use cases. 
More Information ================ We prepared a showcase web site[1] that you can get more information. There are - the official documentations[2], - the heatmap format dynamic access pattern of various realistic workloads for heap area[3], mmap()-ed area[4], and stack[5] area, - the dynamic working set size distribution[6] and chronological working set size changes[7], and - the latest performance test results[8]. [1] https://damonitor.github.io/_index [2] https://damonitor.github.io/doc/html/latest-damon [3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html [4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html [5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html [6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html [7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html [8] https://damonitor.github.io/test/result/perf/latest/html/index.html Baseline and Complete Git Trees =============================== The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree: $ git clone git://github.com/sjp38/linux -b damon/patches/v34 The web is also available: https://github.com/sjp38/linux/releases/tag/damon/patches/v34 Development Trees ----------------- There are a couple of trees for entire DAMON patchset series and features for future release. - For latest release: https://github.com/sjp38/linux/tree/damon/master - For next release: https://github.com/sjp38/linux/tree/damon/next Long-term Support Trees ----------------------- For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports. 
- For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y - For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y Amazon Linux Kernel Trees ------------------------- DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon Git Tree for Diff of Patches ============================ For easy review of diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series: https://github.com/sjp38/damon-patches You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example: $ git clone https://github.com/sjp38/damon-patches && cd damon-patches $ diff -u damon/v33 damon/v34 Sequence Of Patches =================== First three patches implement the core logics of DAMON. The 1st patch introduces basic sampling based hotness monitoring for arbitrary types of targets. Following two patches implement the core mechanisms for control of overhead and accuracy, namely regions based sampling (patch 2) and adaptive regions adjustment (patch 3). Now the essential parts of DAMON is complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address spaces monitoring. The 4th patch makes 'PG_idle' can be used by DAMON and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag. Now DAMON just works for virtual address space monitoring via the kernel space api. To let the user space users can use DAMON, following four patches add interfaces for them. The 6th patch adds a tracepoint for monitoring results. 
The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes DAMON interface to the user space via the debugfs interface. The 8th patch further exports pid of monitoring thread (kdamond) to user space for easier cpu usage accounting, and the 9th patch makes the debugfs interface to support multiple contexts. Three patches for maintainability follows. The 10th patch adds documentations for both the user space and the kernel space. The 11th patch provides unit tests (based on the kunit) while the 12th patch adds user space tests (based on the kselftest). Finally, the last patch (13th) updates the MAINTAINERS file. This patch (of 13): DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it - accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; It might be inappropriate for CPU cache levels, though), - light-weight (the monitoring overhead is normally low enough to be applied online), and - scalable (the upper-bound of the overhead is in constant range regardless of the size of target workloads). Using this framework, hence, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that incurring high access monitoring overhead could again be implemented on top of this. Due to its simple and flexible interface, providing user space interface would be also easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimizations of their workloads and systems. === Nevertheless, this commit is defining and implementing only basic access check part without the overhead-accuracy handling core logic. The basic access check is as below. 
The output of DAMON says how frequently each memory region is accessed for a given duration. The resolution of the access frequency is controlled by setting ``sampling interval`` and ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, it counts the number of the accesses to each region. After each ``aggregation interval`` passes, DAMON calls callback functions that were previously registered by users so that users can read the aggregated results and then clears the results. This can be described in below simple pseudo-code:: init() while monitoring_on: for page in monitoring_target: if accessed(page): nr_accesses[page] += 1 if time() % aggregation_interval == 0: for callback in user_registered_callbacks: callback(monitoring_target, nr_accesses) for page in monitoring_target: nr_accesses[page] = 0 if time() % update_interval == 0: update() sleep(sampling interval) The target regions are constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could be dynamically changed (e.g., mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows. The basic monitoring primitives for actual access check and dynamic target regions construction aren't in the core part of DAMON. Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use the current version of DAMON without some additional work. Following commits will implement the core mechanisms for the overhead-accuracy control and default primitives implementations. 
Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Jonathan Cameron Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Joe Perches Cc: Mel Gorman Cc: Maximilian Heyne Cc: Minchan Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: David Rientjes Cc: Steven Rostedt (VMware) Cc: Shuah Khan Cc: Vlastimil Babka Cc: Vladimir Davydov Cc: Brendan Higgins Cc: Markus Boehme Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 167 ++++++++++++++++++++++++++ mm/Kconfig | 2 + mm/Makefile | 1 + mm/damon/Kconfig | 15 +++ mm/damon/Makefile | 3 + mm/damon/core.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 508 insertions(+) create mode 100644 include/linux/damon.h create mode 100644 mm/damon/Kconfig create mode 100644 mm/damon/Makefile create mode 100644 mm/damon/core.c (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 000000000000..2f652602b1ea --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. 
+ * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses. + * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. 
+ */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. 
+ * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from + * outside of the monitoring thread must be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. 
+ */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 14d5d2837737..8459167b0294 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -886,4 +886,6 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index 0e0a5a6fe127..fc60a40ce954 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -118,6 +118,7 @@ obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 000000000000..d00e99ac1a15 --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. 
The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 000000000000..4fd2edb4becf --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DAMON) := core.o diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 000000000000..651590bf49b1 --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include <linux/damon.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->sample_interval = 5 * 1000; + ctx->aggr_interval = 100 * 1000; + ctx->primitive_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_primitive_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->target = NULL; + + return ctx; +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + kfree(ctx); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @sample_int: time interval between samplings + * @aggr_int: time interval between aggregations + * @primitive_upd_int: time interval between monitoring primitive updates + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int) +{ + ctx->sample_interval = sample_int; + ctx->aggr_interval = aggr_int; + ctx->primitive_update_interval = primitive_upd_int; + + return 0; +} + +static bool damon_kdamond_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + + return running; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond_stop = false; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = 0; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If a + * group of threads that created by other 'damon_start()' call is currently + * running, this function does nothing but returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if (nr_running_ctxs) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); + while (damon_kdamond_running(ctx)) + usleep_range(ctx->sample_interval, + ctx->sample_interval * 2); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + return err; + } + + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. 
+ */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->aggr_interval); +} + +/* + * Check whether it is time to check and apply the target monitoring regions + * + * Returns true if it is. + */ +static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_primitive_update, + ctx->primitive_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. + * + * Returns true if need to stop current monitoring. 
+ */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + bool stop; + + mutex_lock(&ctx->kdamond_lock); + stop = ctx->kdamond_stop; + mutex_unlock(&ctx->kdamond_lock); + if (stop) + return true; + + if (!ctx->primitive.target_valid) + return false; + + return !ctx->primitive.target_valid(ctx->target); +} + +static void set_kdamond_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = (struct damon_ctx *)data; + + mutex_lock(&ctx->kdamond_lock); + pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + mutex_unlock(&ctx->kdamond_lock); + + if (ctx->primitive.init) + ctx->primitive.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + set_kdamond_stop(ctx); + + while (!kdamond_need_stop(ctx)) { + if (ctx->primitive.prepare_access_checks) + ctx->primitive.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + set_kdamond_stop(ctx); + + usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + + if (ctx->primitive.check_accesses) + ctx->primitive.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.reset_aggregated) + ctx->primitive.reset_aggregated(ctx); + } + + if (kdamond_need_update_primitive(ctx)) { + if (ctx->primitive.update) + ctx->primitive.update(ctx); + } + } + + if (ctx->callback.before_terminate && + ctx->callback.before_terminate(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + 
mutex_unlock(&damon_lock); + + do_exit(0); +} -- cgit v1.2.3-71-gd317 From f23b8eee1871a6db5c37f90831147de5426c40b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:32 -0700 Subject: mm/damon/core: implement region-based sampling To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, 1. the 'prepare_access_checks' primitive picks one page in each region, 2. waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so. Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON to use not only time-based sampling but also space-based sampling. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. Next commit will address this problem. 
Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 77 +++++++++++++++++++++++++-- mm/damon/core.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1ea..67db309ad61b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include #include +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. 
+ * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx; /** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - void *target; + struct list_head region_targets; }; +#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON +struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, diff --git a/mm/damon/core.c b/mm/damon/core.c index 651590bf49b1..947486a150ce 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -15,6 +15,101 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + 
region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(&region->list); + + return region; +} + +/* + * Add a region between two other regions + */ +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next) +{ + __list_add(&r->list, &prev->list, &next->list); +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); +} + +static void damon_del_region(struct damon_region *r) +{ + list_del(&r->list); +} + +static void damon_free_region(struct damon_region *r) +{ + kfree(r); +} + +void damon_destroy_region(struct damon_region *r) +{ + damon_del_region(r); + damon_free_region(r); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(unsigned long id) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->id = id; + INIT_LIST_HEAD(&t->regions_list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->region_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -32,15 +127,27 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - ctx->target = NULL; + INIT_LIST_HEAD(&ctx->region_targets); return ctx; } -void damon_destroy_ctx(struct damon_ctx *ctx) +static void damon_destroy_targets(struct damon_ctx *ctx) { - if 
(ctx->primitive.cleanup) + struct damon_target *t, *next_t; + + if (ctx->primitive.cleanup) { ctx->primitive.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_targets(ctx); kfree(ctx); } @@ -217,6 +324,21 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) ctx->aggr_interval); } +/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). + */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) + r->nr_accesses = 0; + } +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -238,6 +360,7 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) */ static bool kdamond_need_stop(struct damon_ctx *ctx) { + struct damon_target *t; bool stop; mutex_lock(&ctx->kdamond_lock); @@ -249,7 +372,12 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (!ctx->primitive.target_valid) return false; - return !ctx->primitive.target_valid(ctx->target); + damon_for_each_target(t, ctx) { + if (ctx->primitive.target_valid(t)) + return false; + } + + return true; } static void set_kdamond_stop(struct damon_ctx *ctx) @@ -265,6 +393,8 @@ static void set_kdamond_stop(struct damon_ctx *ctx) static int kdamond_fn(void *data) { struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_target *t; + struct damon_region *r, *next; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -291,6 +421,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_reset_aggregated(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -300,6 +431,10 @@ static int kdamond_fn(void *data) ctx->primitive.update(ctx); } 
} + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r); + } if (ctx->callback.before_terminate && ctx->callback.before_terminate(ctx)) -- cgit v1.2.3-71-gd317 From b9a6ac4e4ede4172d165c133398b93e3233b0ba7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:36 -0700 Subject: mm/damon: adaptively adjust regions Even somehow the initial monitoring target regions are well constructed to fulfill the assumption (pages in same region have similar access frequencies), the data access pattern can be dynamically changed. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set. 
Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 30 +++++-- mm/damon/core.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 237 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61b..ce2a84b26cd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include #include +/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. 
* @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. + * @adaptive_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; }; #define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list) #define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list) #define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git 
a/mm/damon/core.c b/mm/damon/core.c index 947486a150ce..28a2c78914fa 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,12 @@ #include <linux/damon.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/random.h> #include <linux/slab.h> +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -40,19 +44,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) * Add a region between two other regions */ inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next) + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) { __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; } void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; } -static void damon_del_region(struct damon_region *r) +static void damon_del_region(struct damon_region *r, struct damon_target *t) { list_del(&r->list); + t->nr_regions--; } static void damon_free_region(struct damon_region *r) @@ -60,9 +68,9 @@ static void damon_free_region(struct damon_region *r) kfree(r); } -void damon_destroy_region(struct damon_region *r) +void damon_destroy_region(struct damon_region *r, struct damon_target *t) { - damon_del_region(r); + damon_del_region(r, t); damon_free_region(r); } @@ -80,6 +88,7 @@ struct damon_target *damon_new_target(unsigned long id) return NULL; t->id = id; + t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); return t; @@ -87,7 +96,7 @@ struct damon_target *damon_new_target(unsigned long id) void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) { - list_add_tail(&t->list, &ctx->region_targets); + list_add_tail(&t->list, &ctx->adaptive_targets); } static void damon_del_target(struct damon_target *t) @@ -110,6 +119,11 @@ void damon_destroy_target(struct 
damon_target *t) damon_free_target(t); } +unsigned int damon_nr_regions(struct 
damon_target *t) +{ + return t->nr_regions; +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -127,7 +141,10 @@ struct damon_ctx *damon_new_ctx(void) mutex_init(&ctx->kdamond_lock); - INIT_LIST_HEAD(&ctx->region_targets); + ctx->min_nr_regions = 10; + ctx->max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); return ctx; } @@ -157,6 +174,8 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations * @primitive_upd_int: time interval between monitoring primitive updates + * @min_nr_reg: minimal number of regions + * @max_nr_reg: maximum number of regions * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. @@ -164,15 +183,49 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. */ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int) + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg) { + if (min_nr_reg < 3) { + pr_err("min_nr_regions (%lu) must be at least 3\n", + min_nr_reg); + return -EINVAL; + } + if (min_nr_reg > max_nr_reg) { + pr_err("invalid nr_regions. 
min (%lu) > max (%lu)\n", + min_nr_reg, max_nr_reg); + return -EINVAL; + } + ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; ctx->primitive_update_interval = primitive_upd_int; + ctx->min_nr_regions = min_nr_reg; + ctx->max_nr_regions = max_nr_reg; return 0; } +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += r->ar.end - r->ar.start; + } + + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + static bool damon_kdamond_running(struct damon_ctx *ctx) { bool running; @@ -339,6 +392,150 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +#define sz_damon_region(r) (r->ar.end - r->ar.start) + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +#define diff_of(a, b) (a > b ? 
a - b : b - a) + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (prev && prev->ar.end == r->ar.start && + diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. 
+ */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = r->ar.end - r->ar.start; + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(ctx, t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. 
+ */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(ctx, t, nr_subregions); + + last_nr_regions = nr_regions; +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -395,6 +592,8 @@ static int kdamond_fn(void *data) struct damon_ctx *ctx = (struct damon_ctx *)data; struct damon_target *t; struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -405,6 +604,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_start && ctx->callback.before_start(ctx)) set_kdamond_stop(ctx); + sz_limit = damon_region_sz_limit(ctx); + while (!kdamond_need_stop(ctx)) { if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); @@ -415,13 +616,17 @@ static int kdamond_fn(void *data) usleep_range(ctx->sample_interval, ctx->sample_interval + 1); if (ctx->primitive.check_accesses) - ctx->primitive.check_accesses(ctx); + max_nr_accesses = ctx->primitive.check_accesses(ctx); if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -429,11 +634,12 @@ static int kdamond_fn(void *data) if (kdamond_need_update_primitive(ctx)) { 
if (ctx->primitive.update) ctx->primitive.update(ctx); + sz_limit = damon_region_sz_limit(ctx); } } damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) - damon_destroy_region(r); + damon_destroy_region(r, t); } if (ctx->callback.before_terminate && -- cgit v1.2.3-71-gd317 From 1c676e0d9b1a59b98885b24a0e16a81fe4cc8301 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:40 -0700 Subject: mm/idle_page_tracking: make PG_idle reusable PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic concurrently work while not interfering with each other. That is, when they need to clear the Accessed bit, they set PG_young to represent the previous state of the bit, respectively. And when they need to read the bit, if the bit is cleared, they further read the PG_young to know whether the other has cleared the bit meanwhile or not. For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags. In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking. In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it. 
[sjpark@amazon.de: set PAGE_EXTENSION for non-64BIT] Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com [akpm@linux-foundation.org: tweak Kconfig text] [sjpark@amazon.de: hide PAGE_IDLE_FLAG from users] Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- include/trace/events/mmflags.h | 2 +- mm/Kconfig | 10 +++++++++- mm/page_ext.c | 12 +++++++++++- mm/page_idle.c | 10 ---------- 7 files changed, 27 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b8d66965145..0a51dd1bb6b1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -131,7 +131,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. 
Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -441,7 +441,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index aff81ba31bd8..fabb2e1e087f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdce..d8a6aecf99cb 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index f160484afc5c..a26dbefdf294 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -75,7 +75,7 @@ #define IF_HAVE_PG_HWPOISON(flag,string) #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && 
defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_IDLE(flag,string) diff --git a/mm/Kconfig b/mm/Kconfig index 8459167b0294..d16ba9249bc5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -739,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation. +config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU - select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. This information can diff --git a/mm/page_ext.c b/mm/page_ext.c index 293b2685fc48..dfb91653d359 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. 
*/ +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 64e5344a992c..edead6a8a5f9 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -207,16 +207,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", }; -#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err; -- cgit v1.2.3-71-gd317 From 3f49584b262cf8f42b25f4c1ad9f5bfd3bdc1bca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:44 -0700 Subject: mm/damon: implement primitives for the virtual memory address spaces This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON. The low level primitives for the fundamental access monitoring are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. The reference implementation for the virtual address space does the works as below. PTE Accessed-bit Based Access Check ----------------------------------- The implementation uses PTE Accessed-bit for basic access checks. 
That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle page tracking does. VMA-based Target Address Range Construction ------------------------------------------- Only small parts in the super-huge virtual address space of the processes are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required and could even incur a high overhead in some cases. That said, excessively large unmapped areas inside the monitoring target should be removed so that the adaptive mechanism does not waste time on them. For this reason, this implementation converts the complex mappings to three distinct regions that cover every mapped area of the address space. Also, the two gaps between the three regions are the two biggest unmapped areas in the given address space. The two biggest unmapped areas would be the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack in most cases. Because these gaps are exceptionally huge in usual address spaces, excluding these will be sufficient to make a reasonable trade-off.
Below shows this in detail:: <heap> <BIG UNMAPPED REGION 1> <uppermost mmap()-ed region> (small mmap()-ed regions and munmap()-ed regions) <lowermost mmap()-ed region> <BIG UNMAPPED REGION 2> <stack> [akpm@linux-foundation.org: mm/damon/vaddr.c needs highmem.h for kunmap_atomic()] [sjpark@amazon.de: remove unnecessary PAGE_EXTENSION setup] Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com [sjpark@amazon.de: safely walk page table] Link: https://lkml.kernel.org/r/20210831161800.29419-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 + mm/damon/Kconfig | 8 + mm/damon/Makefile | 1 + mm/damon/vaddr.c | 665 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 687 insertions(+) create mode 100644 mm/damon/vaddr.c (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd7..edb350e52b93 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ +#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void
*t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d00e99ac1a15..5cbb5db54158 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,4 +12,12 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information. +config DAMON_VADDR + bool "Data access monitoring primitives for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for virtual address spaces. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 4fd2edb4becf..6ebbd08aed67 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o +obj-$(CONFIG_DAMON_VADDR) += vaddr.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 000000000000..897aa8cf96c8 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +/* + * 't->id' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. + */ +#define damon_get_task_struct(t) \ + (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. 
+ * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = r->ar.end - r->ar.start; + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +static void swap_ranges(struct damon_addr_range *r1, + struct damon_addr_range *r2) +{ + struct damon_addr_range tmp; + + tmp = *r1; + *r1 = *r2; + *r2 = tmp; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * separated by the two biggest unmapped regions in the space. 
Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. + * + * Returns 0 if success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap_ranges(&gap, &second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap_ranges(&second_gap, &first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap_ranges(&first_gap, &second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. 
+ */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, too huge unmapped areas inside the monitoring target should be removed + * to not take the time for the adaptive mechanism. + * + * For the reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space. 
+ * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * <heap> + * <BIG UNMAPPED REGION 1> + * <uppermost mmap()-ed region> + * (other mmap()-ed regions and small unmapped regions) + * <lowermost mmap()-ed region> + * <BIG UNMAPPED REGION 2> + * <stack> + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i; + + if (damon_va_three_regions(t, regions)) { + pr_err("Failed to get three regions of target %lu\n", t->id); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + __damon_va_init_regions(ctx, t); + } +} + +/* + * Functions for the dynamic monitoring target regions update + */ + +/* + * Check whether a region
is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + struct damon_region *r, *next; + unsigned int i = 0; + + /* Remove regions which are not in the three big regions now */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < 3; i++) { + if (damon_intersect(r, &bregions[i])) + break; + } + if (i == 3) + damon_destroy_region(r, t); + } + + /* Adjust intersecting regions to fit with the three big regions */ + for (i = 0; i < 3; i++) { + struct damon_region *first = NULL, *last; + struct damon_region *newr; + struct damon_addr_range *br; + + br = &bregions[i]; + /* Get the first and last regions which intersects with br */ + damon_for_each_region(r, t) { + if (damon_intersect(r, br)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= br->end) + break; + } + if (!first) { + /* no damon_region intersects with this big region */ + newr = damon_new_region( + ALIGN_DOWN(br->start, + DAMON_MIN_REGION), + ALIGN(br->end, DAMON_MIN_REGION)); + if (!newr) + continue; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + first->ar.start = ALIGN_DOWN(br->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + } + } +} + +/* + * Update regions for current memory mappings + */ +void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_va_apply_three_regions(t, three_regions); + } +} + +/* + * Get an online page for a pfn if it's in 
the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +static struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, + unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, + unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_huge(*pmd)) { + damon_pmdp_mkold(pmd, walk->mm, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = 
pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->mm, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void damon_va_prepare_access_check(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + damon_va_prepare_access_check(ctx, mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) { + *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + priv->young = true; + } + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + pte = 
pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + page = damon_get_page(pte_pfn(*pte)); + if (!page) + goto out; + if (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = PAGE_SIZE; + priv->young = true; + } + put_page(page); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, +}; + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + struct damon_young_walk_private arg = { + .page_sz = page_sz, + .young = false, + }; + + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); + mmap_read_unlock(mm); + return arg.young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void damon_va_check_access(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + static struct mm_struct *last_mm; + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_mm = mm; + last_addr = r->sampling_addr; +} + +unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) { + damon_va_check_access(ctx, mm, r); + max_nr_accesses = max(r->nr_accesses, 
max_nr_accesses); + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +bool damon_va_target_valid(void *target) +{ + struct damon_target *t = target; + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +void damon_va_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = damon_va_init; + ctx->primitive.update = damon_va_update; + ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; + ctx->primitive.check_accesses = damon_va_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_va_target_valid; + ctx->primitive.cleanup = NULL; +} -- cgit v1.2.3-71-gd317 From 4bc05954d0076655cfaf6f0135585bdc20cd6b11 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:53 -0700 Subject: mm/damon: implement a debugfs-based user space interface DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only kernel space API. That said, letting the user space control DAMON could provide some benefits to them. For example, it will allow user space to analyze their specific workloads and make their own special optimizations. For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON api and exports those to the user space via the debugfs. 'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under its debugfs directory, ``/damon/``. Attributes ---------- Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. 
For example, the commands below set those values to 5 ms, 100 ms, 1,000 ms, 10, 1000 and check them again:: # cd /damon # echo 5000 100000 1000000 10 1000 > attrs # cat attrs 5000 100000 1000000 10 1000 Target IDs ---------- Some types of address spaces support multiple monitoring targets. For example, the virtual memory address spaces monitoring can have multiple processes as the monitoring targets. Users can set the targets by writing relevant id values of the targets to, and get the ids of the current targets by reading from the ``target_ids`` file. In case of the virtual address spaces monitoring, the values should be pids of the monitoring target processes. For example, the commands below set processes having pids 42 and 4242 as the monitoring targets and check them again:: # cd /damon # echo 42 4242 > target_ids # cat target_ids 42 4242 Note that setting the target ids doesn't start the monitoring. Turning On/Off -------------- Setting the files as described above has no effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops it. DAMON also stops if every target is invalidated (in case of the virtual memory monitoring, target processes are invalidated when terminated). The example commands below turn on, off, and check the status of DAMON:: # cd /damon # echo on > monitor_on # echo off > monitor_on # cat monitor_on off Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned.
[akpm@linux-foundation.org: remove unneeded "alloc failed" printks] [akpm@linux-foundation.org: replace macro with static inline] Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 3 + mm/damon/Kconfig | 9 ++ mm/damon/Makefile | 1 + mm/damon/core.c | 47 ++++++ mm/damon/dbgfs.c | 397 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 457 insertions(+) create mode 100644 mm/damon/dbgfs.c (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b93..d68b67b8d458 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5cbb5db54158..c8e3dba6fb4c 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig 
@@ -20,4 +20,13 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces. +config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 6ebbd08aed67..fed4be3bace3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/core.c b/mm/damon/core.c index ee24d64e8019..59033488402e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -171,6 +171,39 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +/** + * damon_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_destroy_targets(ctx); + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + pr_err("Failed to alloc damon_target\n"); + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -209,6 +242,20 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. 
+ */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + /* Returns the size upper limit for each monitoring region */ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) { diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 000000000000..d2e0a547eb3f --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include +#include +#include +#include +#include +#include +#include + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; + +/* + * Returns non-empty string on success, negative error code otherwise. + */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->sample_interval, ctx->aggr_interval, + ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + unsigned long 
s, a, r, minr, maxr; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &s, &a, &r, &minr, &maxr) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (err) + ret = err; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +static inline bool targetid_is_pid(const struct damon_ctx *ctx) +{ + return ctx->primitive.target_valid == damon_va_target_valid; +} + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + unsigned long id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + id = t->id; + if (targetid_is_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = (unsigned long)pid_vnr((struct pid *)id); + + rc = scnprintf(&buf[written], len - written, "%lu ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an array of unsigned long integers + * + * Returns an array of unsigned long integers if the conversion success, or + * NULL otherwise. 
+ */ +static unsigned long *str_to_target_ids(const char *str, ssize_t len, + ssize_t *nr_ids) +{ + unsigned long *ids; + const int max_nr_ids = 32; + unsigned long id; + int pos = 0, parsed, ret; + + *nr_ids = 0; + ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); + if (!ids) + return NULL; + while (*nr_ids < max_nr_ids && pos < len) { + ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + pos += parsed; + if (ret != 1) + break; + ids[*nr_ids] = id; + *nr_ids += 1; + } + + return ids; +} + +static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +{ + int i; + + for (i = 0; i < nr_ids; i++) + put_pid((struct pid *)ids[i]); +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf, *nrs; + unsigned long *targets; + ssize_t nr_targets; + ssize_t ret = count; + int i; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + nrs = kbuf; + + targets = str_to_target_ids(nrs, ret, &nr_targets); + if (!targets) { + ret = -ENOMEM; + goto out; + } + + if (targetid_is_pid(ctx)) { + for (i = 0; i < nr_targets; i++) { + targets[i] = (unsigned long)find_get_pid( + (int)targets[i]); + if (!targets[i]) { + dbgfs_put_pids(targets, i); + ret = -EINVAL; + goto free_targets_out; + } + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_targets(ctx, targets, nr_targets); + if (err) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = err; + } + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +free_targets_out: + kfree(targets); +out: + kfree(kbuf); + return ret; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct 
file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "target_ids"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static int dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!targetid_is_pid(ctx)) + return 0; + + damon_for_each_target_safe(t, next, ctx) { + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + return 0; +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + damon_va_set_primitives(ctx); + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? 
"on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret = count; + char *kbuf; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + if (!strncmp(kbuf, "on", count)) + err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + else if (!strncmp(kbuf, "off", count)) + err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + else + err = -EINVAL; + + if (err) + ret = err; + kfree(kbuf); + return ret; +} + +static const struct file_operations monitor_on_fops = { + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"monitor_on"}; + const struct file_operations *fops[] = {&monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + } + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc; + + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) + return -ENOMEM; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + return -ENOMEM; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + + return rc; +} + +module_init(damon_dbgfs_init); -- cgit v1.2.3-71-gd317 From 
a8a47cf5ce4bbc70a54fa4eca71d35f43dc8218a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Sep 2021 19:57:41 -0700 Subject: include/linux/once.h: fix trivia typo Not -> Note Fix trivia typo Not -> Note in the comment to DO_ONCE(). Link: https://lkml.kernel.org/r/20210722184349.76290-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index ae6f4eb41cbe..d361fb14ac3a 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -16,7 +16,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, * out the condition into a nop. DO_ONCE() guarantees type safety of * arguments! * - * Not that the following is not equivalent ... + * Note that the following is not equivalent ... * * DO_ONCE(func, arg); * DO_ONCE(func, arg); -- cgit v1.2.3-71-gd317 From c9221919a2d2df5741ab074dfec5bdfc6f1e043b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:44 -0700 Subject: units: change from 'L' to 'UL' Patch series "Add Hz macros", v3. There are multiple definitions of the HZ_PER_MHZ or HZ_PER_KHZ in the different drivers. Instead of duplicating this definition again and again, add one in the units.h header to be reused in all the places the redefinition occurs. At the same time, change the type of the Watts, as they can not be negative. This patch (of 10): The users of the macros are safe to be assigned with an unsigned instead of signed as the variables using them are themselves unsigned.
Link: https://lkml.kernel.org/r/20210816114732.1834145-1-daniel.lezcano@linaro.org Link: https://lkml.kernel.org/r/20210816114732.1834145-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Cc: Andy Shevchenko Cc: Jonathan Cameron Cc: Christian Eggers Cc: Lukasz Luba Cc: MyungJoo Ham Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Peter Meerwald Cc: Zhang Rui Cc: Guenter Roeck Cc: Miquel Raynal Cc: Maxime Coquelin Cc: "Rafael J. Wysocki" Cc: Daniel Lezcano Cc: Chanwoo Choi Cc: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index dcc30a53fa93..ff51d3cfc6a0 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,9 +4,9 @@ #include -#define MILLIWATT_PER_WATT 1000L -#define MICROWATT_PER_MILLIWATT 1000L -#define MICROWATT_PER_WATT 1000000L +#define MILLIWATT_PER_WATT 1000UL +#define MICROWATT_PER_MILLIWATT 1000UL +#define MICROWATT_PER_WATT 1000000UL #define ABSOLUTE_ZERO_MILLICELSIUS -273150 -- cgit v1.2.3-71-gd317 From e2c77032fcbe515194107994d12cd72ddb77b022 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:48 -0700 Subject: units: add the HZ macros The macros for the unit conversion for frequency are duplicated in different places. Provide these macros in the 'units' header, so they can be reused. Link: https://lkml.kernel.org/r/20210816114732.1834145-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. 
Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index ff51d3cfc6a0..8b8dc8a84d93 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,6 +4,10 @@ #include +#define HZ_PER_KHZ 1000UL +#define KHZ_PER_MHZ 1000UL +#define HZ_PER_MHZ 1000000UL + #define MILLIWATT_PER_WATT 1000UL #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL -- cgit v1.2.3-71-gd317 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. 
[rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 18 ++++++++++-------- include/linux/sched/user.h | 3 ++- kernel/user.c | 25 +++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e596e1d0bba..648ed77f4164 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) */ call_rcu(&epi->rcu, epi_rcu_free); - atomic_long_dec(&ep->user->epoll_watches); + percpu_counter_dec(&ep->user->epoll_watches); return 0; } @@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - long user_watches; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; @@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, lockdep_assert_irqs_enabled(); - user_watches = atomic_long_read(&ep->user->epoll_watches); - if (unlikely(user_watches >= max_user_watches)) + if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, + max_user_watches) >= 0)) return -ENOSPC; - if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) + percpu_counter_inc(&ep->user->epoll_watches); + + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; + } /* Item initialization follow here ... 
*/ INIT_LIST_HEAD(&epi->rdllink); @@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { - kmem_cache_free(epi_cache, epi); if (tep) mutex_unlock(&tep->mtx); + kmem_cache_free(epi_cache, epi); + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); - atomic_long_inc(&ep->user->epoll_watches); - /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 2462f7d07695..00ed419dd464 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ struct user_struct { refcount_t __count; /* reference count */ #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ diff --git a/kernel/user.c b/kernel/user.c index c82399c1618a..e2cf8c22b539 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) return NULL; } +static int user_epoll_alloc(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); +#else + return 0; +#endif +} + +static void user_epoll_free(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + percpu_counter_destroy(&up->epoll_watches); +#endif +} + /* IRQs are disabled and uidhash_lock is held upon function entry. 
* IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + user_epoll_free(up); kmem_cache_free(uid_cachep, up); } @@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); + if (user_epoll_alloc(new)) { + kmem_cache_free(uid_cachep, new); + return NULL; + } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); @@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -216,6 +238,9 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); + if (user_epoll_alloc(&root_user)) + panic("root_user epoll percpu counter alloc failed"); + /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); -- cgit v1.2.3-71-gd317 From 5b91a75b3312c03798f555e10569fd85211a490c Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 7 Sep 2021 20:00:38 -0700 Subject: pid: cleanup the stale comment mentioning pidmap_init(). pidmap_init() has already been replaced with pid_idr_init() in the commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API"). Cleanup the stale comment which still mentions it. 
Link: https://lkml.kernel.org/r/20210714120713.19825-1-itazur@amazon.com Signed-off-by: Takahiro Itazuri Cc: Kuniyuki Iwashima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/threads.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/threads.h b/include/linux/threads.h index 18d5a74bcc3d..c34173e6c5f1 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -38,7 +38,7 @@ * Define a minimum number of pids per cpu. Heuristically based * on original pid max of 32k for 32 cpus. Also, increase the * minimum settable value for pid_max on the running system based - * on similar defaults. See kernel/pid.c:pidmap_init() for details. + * on similar defaults. See kernel/pid.c:pid_idr_init() for details. */ #define PIDS_PER_CPU_DEFAULT 1024 #define PIDS_PER_CPU_MIN 8 -- cgit v1.2.3-71-gd317 From e130242dc351f1cfa2bbeb6766a1486ce936ef88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:21 -0700 Subject: mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. 
x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 17 ++--- mm/mempolicy.c | 176 ++++++++++++++++--------------------------------- 2 files changed, 64 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e0598c7d1d1..3a2ac5afee30 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -395,14 +395,6 @@ struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; -#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) - -#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size); -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size); void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo 
*from); int copy_siginfo_from_user32(kernel_siginfo_t *to, @@ -976,6 +968,15 @@ static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ +#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) + +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) + +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, + unsigned long bitmap_size); +long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, + unsigned long bitmap_size); + /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e90b3fb7794..eb95578f5997 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1362,16 +1362,33 @@ mpol_out: /* * User space interface with variable sized bitmaps for nodelists. */ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} /* Copy a node mask from user space. 
*/ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - unsigned long k; - unsigned long t; - unsigned long nlongs; - unsigned long endmask; - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) @@ -1379,49 +1396,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* * When the user specified more nodes than supported just check - * if the non supported part is all zero. - * - * If maxnode have more longs than MAX_NUMNODES, check - * the bits in that area first. And then go through to - * check the rest bits which equal or bigger than MAX_NUMNODES. - * Otherwise, just check bits [MAX_NUMNODES, maxnode). + * if the non supported part is all zero, one word at a time, + * starting at the end. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; - if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { - unsigned long valid_mask = endmask; - - valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); - if (get_user(t, nmask + nlongs - 1)) + if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) return -EFAULT; - if (t & valid_mask) + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + } + if (t) return -EINVAL; } - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned 
long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ @@ -1430,6 +1427,10 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1437,7 +1438,13 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; + maxnode = nr_node_ids; } + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } @@ -1649,72 +1656,22 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, compat_ulong_t, maxnode, compat_ulong_t, addr, compat_ulong_t, flags) { - long err; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) - nm = compat_alloc_user_space(alloc_size); - - err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); - - if (!err && nmask) { - unsigned long copy_size; - copy_size = min_t(unsigned long, sizeof(bm), alloc_size); - err = copy_from_user(bm, nm, copy_size); - /* ensure entire bitmap is zeroed */ - err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); - err |= compat_put_bitmap(nmask, bm, nr_bits); - } - - return err; + return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, + maxnode, addr, flags); } COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode) { 
- unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(bm, nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, bm, alloc_size)) - return -EFAULT; - } - - return kernel_set_mempolicy(mode, nm, nr_bits+1); + return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, compat_ulong_t, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, flags) { - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - nodemask_t bm; - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, nodes_addr(bm), alloc_size)) - return -EFAULT; - } - - return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); + return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, + maxnode, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, @@ -1722,32 +1679,9 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, const compat_ulong_t __user *, old_nodes, const compat_ulong_t __user *, new_nodes) { - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return kernel_migrate_pages(pid, nr_bits + 1, old, new); + return kernel_migrate_pages(pid, maxnode, + (const unsigned long __user *)old_nodes, + (const unsigned long __user *)new_nodes); } #endif /* CONFIG_COMPAT */ -- cgit v1.2.3-71-gd317 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/unistd32.h | 10 ++++----- arch/mips/kernel/syscalls/syscall_n32.tbl | 10 ++++----- arch/mips/kernel/syscalls/syscall_o32.tbl | 10 ++++----- arch/parisc/kernel/syscalls/syscall.tbl | 8 +++---- arch/powerpc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/s390/kernel/syscalls/syscall.tbl | 10 ++++----- arch/sparc/kernel/syscalls/syscall.tbl | 10 ++++----- arch/x86/entry/syscalls/syscall_32.tbl | 4 ++-- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/compat.h | 20 ----------------- include/uapi/asm-generic/unistd.h | 10 ++++----- kernel/sys_ni.c | 5 ----- mm/mempolicy.c | 37 ------------------------------- mm/migrate.c | 13 ----------- 14 files changed, 42 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 4e99e4b912ef..844f6ae58662 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -649,11 +649,11 @@ __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 318 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) #define __NR_mbind 319 -__SYSCALL(__NR_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 320 -__SYSCALL(__NR_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 321 -__SYSCALL(__NR_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_openat 322 __SYSCALL(__NR_openat, compat_sys_openat) #define __NR_mkdirat 323 @@ -699,7 +699,7 @@ __SYSCALL(__NR_tee, sys_tee) #define __NR_vmsplice 343 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 344 -__SYSCALL(__NR_move_pages, compat_sys_move_pages) 
+__SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_getcpu 345 __SYSCALL(__NR_getcpu, sys_getcpu) #define __NR_epoll_pwait 346 @@ -811,7 +811,7 @@ __SYSCALL(__NR_rseq, sys_rseq) #define __NR_io_pgetevents 399 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents) #define __NR_migrate_pages 400 -__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_kexec_file_load 401 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) /* 402 is unused */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 56c8d3cf42ed..70e32de2bcaa 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -239,9 +239,9 @@ 228 n32 clock_nanosleep sys_clock_nanosleep_time32 229 n32 tgkill sys_tgkill 230 n32 utimes sys_utimes_time32 -231 n32 mbind compat_sys_mbind -232 n32 get_mempolicy compat_sys_get_mempolicy -233 n32 set_mempolicy compat_sys_set_mempolicy +231 n32 mbind sys_mbind +232 n32 get_mempolicy sys_get_mempolicy +233 n32 set_mempolicy sys_set_mempolicy 234 n32 mq_open compat_sys_mq_open 235 n32 mq_unlink sys_mq_unlink 236 n32 mq_timedsend sys_mq_timedsend_time32 @@ -258,7 +258,7 @@ 247 n32 inotify_init sys_inotify_init 248 n32 inotify_add_watch sys_inotify_add_watch 249 n32 inotify_rm_watch sys_inotify_rm_watch -250 n32 migrate_pages compat_sys_migrate_pages +250 n32 migrate_pages sys_migrate_pages 251 n32 openat sys_openat 252 n32 mkdirat sys_mkdirat 253 n32 mknodat sys_mknodat @@ -279,7 +279,7 @@ 268 n32 sync_file_range sys_sync_file_range 269 n32 tee sys_tee 270 n32 vmsplice sys_vmsplice -271 n32 move_pages compat_sys_move_pages +271 n32 move_pages sys_move_pages 272 n32 set_robust_list compat_sys_set_robust_list 273 n32 get_robust_list compat_sys_get_robust_list 274 n32 kexec_load compat_sys_kexec_load diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 
201237fd0f43..a61c35edaa74 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -279,9 +279,9 @@ 265 o32 clock_nanosleep sys_clock_nanosleep_time32 266 o32 tgkill sys_tgkill 267 o32 utimes sys_utimes_time32 -268 o32 mbind sys_mbind compat_sys_mbind -269 o32 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 o32 set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 o32 mbind sys_mbind +269 o32 get_mempolicy sys_get_mempolicy +270 o32 set_mempolicy sys_set_mempolicy 271 o32 mq_open sys_mq_open compat_sys_mq_open 272 o32 mq_unlink sys_mq_unlink 273 o32 mq_timedsend sys_mq_timedsend_time32 @@ -298,7 +298,7 @@ 284 o32 inotify_init sys_inotify_init 285 o32 inotify_add_watch sys_inotify_add_watch 286 o32 inotify_rm_watch sys_inotify_rm_watch -287 o32 migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 o32 migrate_pages sys_migrate_pages 288 o32 openat sys_openat compat_sys_openat 289 o32 mkdirat sys_mkdirat 290 o32 mknodat sys_mknodat @@ -319,7 +319,7 @@ 305 o32 sync_file_range sys_sync_file_range sys32_sync_file_range 306 o32 tee sys_tee 307 o32 vmsplice sys_vmsplice -308 o32 move_pages sys_move_pages compat_sys_move_pages +308 o32 move_pages sys_move_pages 309 o32 set_robust_list sys_set_robust_list compat_sys_set_robust_list 310 o32 get_robust_list sys_get_robust_list compat_sys_get_robust_list 311 o32 kexec_load sys_kexec_load compat_sys_kexec_load diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 0bf854b70612..bf751e0732b7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -292,9 +292,9 @@ 258 32 clock_nanosleep sys_clock_nanosleep_time32 258 64 clock_nanosleep sys_clock_nanosleep 259 common tgkill sys_tgkill -260 common mbind sys_mbind compat_sys_mbind -261 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -262 common set_mempolicy sys_set_mempolicy 
compat_sys_set_mempolicy +260 common mbind sys_mbind +261 common get_mempolicy sys_get_mempolicy +262 common set_mempolicy sys_set_mempolicy # 263 was vserver 264 common add_key sys_add_key 265 common request_key sys_request_key @@ -331,7 +331,7 @@ 292 64 sync_file_range sys_sync_file_range 293 common tee sys_tee 294 common vmsplice sys_vmsplice -295 common move_pages sys_move_pages compat_sys_move_pages +295 common move_pages sys_move_pages 296 common getcpu sys_getcpu 297 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 298 common statfs64 sys_statfs64 compat_sys_statfs64 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 29b55e2e035c..7bef917cc84e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -330,10 +330,10 @@ 256 64 sys_debug_setcontext sys_ni_syscall 256 spu sys_debug_setcontext sys_ni_syscall # 257 reserved for vserver -258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages -259 nospu mbind sys_mbind compat_sys_mbind -260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +258 nospu migrate_pages sys_migrate_pages +259 nospu mbind sys_mbind +260 nospu get_mempolicy sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy 262 nospu mq_open sys_mq_open compat_sys_mq_open 263 nospu mq_unlink sys_mq_unlink 264 32 mq_timedsend sys_mq_timedsend_time32 @@ -381,7 +381,7 @@ 298 common faccessat sys_faccessat 299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -301 common move_pages sys_move_pages compat_sys_move_pages +301 common move_pages sys_move_pages 302 common getcpu sys_getcpu 303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 304 32 utimensat sys_utimensat_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl 
b/arch/s390/kernel/syscalls/syscall.tbl index aa9d68b8ee14..df5261e5cfe1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -274,9 +274,9 @@ 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +268 common mbind sys_mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open 272 common mq_unlink sys_mq_unlink sys_mq_unlink 273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 @@ -293,7 +293,7 @@ 284 common inotify_init sys_inotify_init sys_inotify_init 285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +287 common migrate_pages sys_migrate_pages sys_migrate_pages 288 common openat sys_openat compat_sys_openat 289 common mkdirat sys_mkdirat sys_mkdirat 290 common mknodat sys_mknodat sys_mknodat @@ -317,7 +317,7 @@ 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range 308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages +310 common move_pages sys_move_pages sys_move_pages 311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 313 common utimes sys_utimes sys_utimes_time32 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7893104718c2..c37764dc764d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ 
b/arch/sparc/kernel/syscalls/syscall.tbl @@ -365,12 +365,12 @@ 299 common unshare sys_unshare 300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 301 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -302 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -303 common mbind sys_mbind compat_sys_mbind -304 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -305 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +302 common migrate_pages sys_migrate_pages +303 common mbind sys_mbind +304 common get_mempolicy sys_get_mempolicy +305 common set_mempolicy sys_set_mempolicy 306 common kexec_load sys_kexec_load compat_sys_kexec_load -307 common move_pages sys_move_pages compat_sys_move_pages +307 common move_pages sys_move_pages 308 common getcpu sys_getcpu 309 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait 310 32 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 61f18b72552b..960a021d543e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -286,7 +286,7 @@ 272 i386 fadvise64_64 sys_ia32_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +275 i386 get_mempolicy sys_get_mempolicy 276 i386 set_mempolicy sys_set_mempolicy 277 i386 mq_open sys_mq_open compat_sys_mq_open 278 i386 mq_unlink sys_mq_unlink @@ -328,7 +328,7 @@ 314 i386 sync_file_range sys_ia32_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages +317 i386 move_pages sys_move_pages 318 i386 getcpu sys_getcpu 319 i386 epoll_pwait sys_epoll_pwait 320 i386 utimensat sys_utimensat_time32 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 807b6a1de8e8..18b5500ea8bf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl 
+++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -398,7 +398,7 @@ 530 x32 set_robust_list compat_sys_set_robust_list 531 x32 get_robust_list compat_sys_get_robust_list 532 x32 vmsplice sys_vmsplice -533 x32 move_pages compat_sys_move_pages +533 x32 move_pages sys_move_pages 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 14c8fe863c6d..1c5fb86d455a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -673,15 +673,15 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, 
sys_remap_file_pages) #define __NR_mbind 235 -__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +__SYSCALL(__NR_mbind, sys_mbind) #define __NR_get_mempolicy 236 -__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) #define __NR_set_mempolicy 237 -__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) #define __NR_migrate_pages 238 -__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) #define __NR_move_pages 239 -__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +__SYSCALL(__NR_move_pages, sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb95578f5997..8d14240896a8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1649,43 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - return kernel_get_mempolicy(policy, (unsigned long __user *)nmask, - maxnode, addr, flags); -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, 
int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - return kernel_set_mempolicy(mode, (unsigned long __user *)nmask, maxnode); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - return kernel_mbind(start, len, mode, (unsigned long __user *)nmask, - maxnode, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - return kernel_migrate_pages(pid, maxnode, - (const unsigned long __user *)old_nodes, - (const unsigned long __user *)new_nodes); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) diff --git a/mm/migrate.c b/mm/migrate.c index 2bc494875cea..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2047,19 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - return kernel_move_pages(pid, nr_pages, - (const void __user *__user *)pages, - nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3-71-gd317 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. 
Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/compat.h | 5 --- arch/arm64/include/asm/uaccess.h | 11 ----- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/copy_in_user.S | 77 --------------------------------- arch/mips/cavium-octeon/octeon-memcpy.S | 2 - arch/mips/include/asm/compat.h | 8 ---- arch/mips/include/asm/uaccess.h | 26 ----------- arch/mips/lib/memcpy.S | 11 ----- arch/parisc/include/asm/compat.h | 6 --- arch/parisc/include/asm/uaccess.h | 2 - arch/parisc/lib/memcpy.c | 9 ---- arch/powerpc/include/asm/compat.h | 16 ------- arch/s390/include/asm/compat.h | 10 ----- arch/s390/include/asm/uaccess.h | 3 -- arch/s390/lib/uaccess.c | 63 --------------------------- arch/sparc/include/asm/compat.h | 19 -------- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/signal32.c | 12 ++--- arch/sparc/kernel/signal_64.c | 8 ++-- arch/x86/include/asm/compat.h | 13 ------ arch/x86/include/asm/uaccess_64.h | 7 --- include/linux/compat.h | 2 - include/linux/uaccess.h | 10 ----- kernel/compat.c | 21 --------- 24 files changed, 12 insertions(+), 333 deletions(-) delete mode 100644 arch/arm64/lib/copy_in_user.S (limited to 'include/linux') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 79c1a750e357..eaa6ca062d89 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -107,11 +107,6 @@ struct compat_statfs { #define compat_user_stack_pointer() 
(user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -static inline void __user *arch_compat_alloc_user_space(long len) -{ - return (void __user *)compat_user_stack_pointer() - len; -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b5f08621fa29..190b494e22ab 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -430,17 +430,6 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi __actu_ret; \ }) -extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); -#define raw_copy_in_user(to, from, n) \ -({ \ - unsigned long __aciu_ret; \ - uaccess_ttbr0_enable(); \ - __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ - __uaccess_mask_ptr(from), (n)); \ - uaccess_ttbr0_disable(); \ - __aciu_ret; \ -}) - #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 6dd56a49790a..0941180a86d3 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ - copy_to_user.o copy_in_user.o copy_page.o \ + copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S deleted file mode 100644 index dbea3799c3ef..000000000000 --- a/arch/arm64/lib/copy_in_user.S +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copy from user space to user space - * - * Copyright (C) 2012 ARM Ltd. 
- */ - -#include - -#include -#include -#include - -/* - * Copy from user space to user space (alignment handled by the hardware) - * - * Parameters: - * x0 - to - * x1 - from - * x2 - n - * Returns: - * x0 - bytes not copied - */ - .macro ldrb1 reg, ptr, val - user_ldst 9998f, ldtrb, \reg, \ptr, \val - .endm - - .macro strb1 reg, ptr, val - user_ldst 9998f, sttrb, \reg, \ptr, \val - .endm - - .macro ldrh1 reg, ptr, val - user_ldst 9997f, ldtrh, \reg, \ptr, \val - .endm - - .macro strh1 reg, ptr, val - user_ldst 9997f, sttrh, \reg, \ptr, \val - .endm - - .macro ldr1 reg, ptr, val - user_ldst 9997f, ldtr, \reg, \ptr, \val - .endm - - .macro str1 reg, ptr, val - user_ldst 9997f, sttr, \reg, \ptr, \val - .endm - - .macro ldp1 reg1, reg2, ptr, val - user_ldp 9997f, \reg1, \reg2, \ptr, \val - .endm - - .macro stp1 reg1, reg2, ptr, val - user_stp 9997f, \reg1, \reg2, \ptr, \val - .endm - -end .req x5 -srcin .req x15 -SYM_FUNC_START(__arch_copy_in_user) - add end, x0, x2 - mov srcin, x1 -#include "copy_template.S" - mov x0, #0 - ret -SYM_FUNC_END(__arch_copy_in_user) -EXPORT_SYMBOL(__arch_copy_in_user) - - .section .fixup,"ax" - .align 2 -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -USER(9998f, ldtrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret - .previous diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 600d018cf354..0a515cde1c18 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -154,8 +154,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) /* * Note: dst & src may be unaligned, len may be 0 * Temps diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 
53f015a1b0a7..bbb3bc5a42fd 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -96,14 +96,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = (struct pt_regs *) - ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1; - - return (void __user *) (regs->regs[29] - len); -} - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 783fecce65c8..f8f74f9f5883 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -428,7 +428,6 @@ do { \ extern size_t __raw_copy_from_user(void *__to, const void *__from, size_t __n); extern size_t __raw_copy_to_user(void *__to, const void *__from, size_t __n); -extern size_t __raw_copy_in_user(void *__to, const void *__from, size_t __n); static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) @@ -480,31 +479,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -static inline unsigned long -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - register void __user *__cu_to_r __asm__("$4"); - register const void __user *__cu_from_r __asm__("$5"); - register long __cu_len_r __asm__("$6"); - - __cu_to_r = to; - __cu_from_r = from; - __cu_len_r = n; - - __asm__ __volatile__( - ".set\tnoreorder\n\t" - __MODULE_JAL(__raw_copy_in_user) - ".set\tnoat\n\t" - __UA_ADDU "\t$1, %1, %2\n\t" - ".set\tat\n\t" - ".set\treorder" - : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r) - : - : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31", - DADDI_SCRATCH, "memory"); - return __cu_len_r; -} - extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size); /* diff --git a/arch/mips/lib/memcpy.S 
b/arch/mips/lib/memcpy.S index e19fb98b5d38..277c32296636 100644 --- a/arch/mips/lib/memcpy.S +++ b/arch/mips/lib/memcpy.S @@ -666,8 +666,6 @@ FEXPORT(__raw_copy_from_user) EXPORT_SYMBOL(__raw_copy_from_user) FEXPORT(__raw_copy_to_user) EXPORT_SYMBOL(__raw_copy_to_user) -FEXPORT(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) #endif /* Legacy Mode, user <-> user */ __BUILD_COPY_USER LEGACY_MODE USEROP USEROP @@ -703,13 +701,4 @@ EXPORT_SYMBOL(__raw_copy_to_user) __BUILD_COPY_USER EVA_MODE KERNELOP USEROP END(__raw_copy_to_user) -/* - * __copy_in_user (EVA) - */ - -LEAF(__raw_copy_in_user) -EXPORT_SYMBOL(__raw_copy_in_user) -__BUILD_COPY_USER EVA_MODE USEROP USEROP -END(__raw_copy_in_user) - #endif diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index b5d90e82b65d..c04f5a637c39 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -163,12 +163,6 @@ struct compat_shmid64_ds { #define COMPAT_ELF_NGREG 80 typedef compat_ulong_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; -static __inline__ void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = &current->thread.regs; - return (void __user *)regs->gr[30]; -} - static inline int __is_compat_task(struct task_struct *t) { return test_tsk_thread_flag(t, TIF_32BIT); diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ed2cd4fb479b..7c13314aae4a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -215,8 +215,6 @@ unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src, unsigned long len); unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src, unsigned long len); -unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src, - unsigned long len); #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 
4b75388190b4..ea70a0e08321 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -38,14 +38,6 @@ unsigned long raw_copy_from_user(void *dst, const void __user *src, } EXPORT_SYMBOL(raw_copy_from_user); -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len) -{ - mtsp(get_user_space(), 1); - mtsp(get_user_space(), 2); - return pa_memcpy((void __force *)dst, (void __force *)src, len); -} - - void * memcpy(void * dst,const void *src, size_t count) { mtsp(get_kernel_space(), 1); @@ -54,7 +46,6 @@ void * memcpy(void * dst,const void *src, size_t count) return dst; } -EXPORT_SYMBOL(raw_copy_in_user); EXPORT_SYMBOL(memcpy); bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index e33dcf134cdd..7afc96fb6524 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -83,22 +83,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current->thread.regs; - unsigned long usp = regs->gpr[1]; - - /* - * We can't access below the stack pointer in the 32bit ABI and - * can access 288 bytes in the 64bit big-endian ABI, - * or 512 bytes with the new ELFv2 little-endian ABI. - */ - if (!is_32bit_task()) - usp -= USER_REDZONE_SIZE; - - return (void __user *) (usp - len); -} - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. 
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 8d49505b4a43..cdc7ae72529d 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -176,16 +176,6 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_31BIT); } -static inline void __user *arch_compat_alloc_user_space(long len) -{ - unsigned long stack; - - stack = KSTK_ESP(current); - if (is_compat_task()) - stack &= 0x7fffffffUL; - return (void __user *) (stack - len); -} - #endif struct compat_ipc64_perm { diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9ed9aa37e836..ce550d06abc3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -227,9 +227,6 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s __get_user(x, ptr); \ }) -unsigned long __must_check -raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); - /* * Copy a null terminated string from userspace. */ diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 94ca99bde59d..a596e69d3c47 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -204,69 +204,6 @@ unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long } EXPORT_SYMBOL(raw_copy_to_user); -static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1, tmp2; - - tmp1 = -4096UL; - /* FIXME: copy with reduced length. 
*/ - asm volatile( - " lgr 0,%[spec]\n" - "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" - " jz 2f\n" - "1: algr %0,%3\n" - " slgr %1,%3\n" - " slgr %2,%3\n" - " j 0b\n" - "2:slgr %0,%0\n" - "3: \n" - EX_TABLE(0b,3b) - : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) - : [spec] "d" (0x810081UL) - : "cc", "memory", "0"); - return size; -} - -static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, - unsigned long size) -{ - unsigned long tmp1; - - asm volatile( - " sacf 256\n" - " aghi %0,-1\n" - " jo 5f\n" - " bras %3,3f\n" - "0: aghi %0,257\n" - "1: mvc 0(1,%1),0(%2)\n" - " la %1,1(%1)\n" - " la %2,1(%2)\n" - " aghi %0,-1\n" - " jnz 1b\n" - " j 5f\n" - "2: mvc 0(256,%1),0(%2)\n" - " la %1,256(%1)\n" - " la %2,256(%2)\n" - "3: aghi %0,-256\n" - " jnm 2b\n" - "4: ex %0,1b-0b(%3)\n" - "5: slgr %0,%0\n" - "6: sacf 768\n" - EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) - : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) - : : "cc", "memory"); - return size; -} - -unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - if (copy_with_mvcos()) - return copy_in_user_mvcos(to, from, n); - return copy_in_user_mvc(to, from, n); -} -EXPORT_SYMBOL(raw_copy_in_user); - static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 8b63410e830f..bd949fcf9d63 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -116,25 +116,6 @@ struct compat_statfs { #define COMPAT_OFF_T_MAX 0x7fffffff -#ifdef CONFIG_COMPAT -static inline void __user *arch_compat_alloc_user_space(long len) -{ - struct pt_regs *regs = current_thread_info()->kregs; - unsigned long usp = regs->u_regs[UREG_I6]; - - if (test_thread_64bit_stack(usp)) - usp += STACK_BIAS; - - if (test_thread_flag(TIF_32BIT)) - usp &= 0xffffffffUL; - - usp -= len; - usp &= ~0x7UL; - - 
return (void __user *) usp; -} -#endif - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 093849bfda50..d1cc410d2f64 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -455,7 +455,7 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) distance = fp - psp; rval = (csp - distance); - if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) + if (raw_copy_in_user((void __user *)rval, (void __user *)psp, distance)) rval = 0; else if (!stack_64bit) { if (put_user(((u32)csp), diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 4276b9e003ca..6cc124a3bb98 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -435,9 +435,9 @@ static int setup_frame32(struct ksignal *ksig, struct pt_regs *regs, (_COMPAT_NSIG_WORDS - 1) * sizeof(unsigned int)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; @@ -567,9 +567,9 @@ static int setup_rt_frame32(struct ksignal *ksig, struct pt_regs *regs, err |= put_compat_sigset(&sf->mask, oldset, sizeof(compat_sigset_t)); if (!wsaved) { - err |= copy_in_user((u32 __user *)sf, - (u32 __user *)(regs->u_regs[UREG_FP]), - sizeof(struct reg_window32)); + err |= raw_copy_in_user((u32 __user *)sf, + (u32 __user *)(regs->u_regs[UREG_FP]), + sizeof(struct reg_window32)); } else { struct reg_window *rp; diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index cea23cf95600..2a78d2af1265 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -406,10 +406,10 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) err |= copy_to_user(&sf->mask, 
sigmask_to_save(), sizeof(sigset_t)); if (!wsaved) { - err |= copy_in_user((u64 __user *)sf, - (u64 __user *)(regs->u_regs[UREG_FP] + - STACK_BIAS), - sizeof(struct reg_window)); + err |= raw_copy_in_user((u64 __user *)sf, + (u64 __user *)(regs->u_regs[UREG_FP] + + STACK_BIAS), + sizeof(struct reg_window)); } else { struct reg_window *rp; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 4ae01cdb99de..7516e4199b3c 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -156,19 +156,6 @@ struct compat_shmid64_ds { (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif -static inline void __user *arch_compat_alloc_user_space(long len) -{ - compat_uptr_t sp = task_pt_regs(current)->sp; - - /* - * -128 for the x32 ABI redzone. For IA32, it is not strictly - * necessary, but not harmful. - */ - sp -= 128; - - return (void __user *)round_down(sp - len, 16); -} - static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index e7265a552f4f..45697e04d771 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -static __always_inline __must_check -unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) -{ - return copy_user_generic((__force void *)dst, - (__force void *)src, size); -} - extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d42cebd1fb8..1c758b0e0359 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -511,8 +511,6 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, struct epoll_event; /* fortunately, this one is fixed-layout */ 
-extern void __user *compat_alloc_user_space(unsigned long len); - int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index c05e903cef02..ac0394087f7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -200,16 +200,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) n = _copy_to_user(to, from, n); return n; } -#ifdef CONFIG_COMPAT -static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - might_fault(); - if (access_ok(to, n) && access_ok(from, n)) - n = raw_copy_in_user(to, from, n); - return n; -} -#endif #ifndef copy_mc_to_kernel /* diff --git a/kernel/compat.c b/kernel/compat.c index 05adfd6fa8bf..55551989d9da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3-71-gd317 From b83a908498d68fafca931e1276e145b339cac5fb Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 2 Aug 2021 13:23:20 -0700 Subject: compiler_attributes.h: move __compiletime_{error|warning} Clang 14 will add support for __attribute__((__error__(""))) and __attribute__((__warning__(""))). 
To make use of these in __compiletime_error and __compiletime_warning (as used by BUILD_BUG and friends) for newer clang and detect/fallback for older versions of clang, move these to compiler_attributes.h and guard them with __has_attribute preprocessor guards. Link: https://reviews.llvm.org/D106030 Link: https://bugs.llvm.org/show_bug.cgi?id=16428 Link: https://github.com/ClangBuiltLinux/linux/issues/1173 Signed-off-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook [Reworded, landed in Clang 14] Signed-off-by: Miguel Ojeda --- include/linux/compiler-gcc.h | 3 --- include/linux/compiler_attributes.h | 24 ++++++++++++++++++++++++ include/linux/compiler_types.h | 6 ------ 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5d97ef738a57..61c1479688db 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -43,9 +43,6 @@ #define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#define __compiletime_warning(message) __attribute__((__warning__(message))) -#define __compiletime_error(message) __attribute__((__error__(message))) - #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 7b1fa5c30169..f4df9e5a8c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -30,6 +30,7 @@ # define __GCC4_has_attribute___assume_aligned__ (__GNUC_MINOR__ >= 9) # define __GCC4_has_attribute___copy__ 0 # define __GCC4_has_attribute___designated_init__ 0 +# define __GCC4_has_attribute___error__ 1 # define __GCC4_has_attribute___externally_visible__ 1 # define __GCC4_has_attribute___no_caller_saved_registers__ 0 # define __GCC4_has_attribute___noclone__ 1 @@ -38,6 +39,7 @@ # define 
__GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) # define __GCC4_has_attribute___no_sanitize_coverage__ 0 # define __GCC4_has_attribute___fallthrough__ 0 +# define __GCC4_has_attribute___warning__ 1 #endif /* @@ -137,6 +139,17 @@ # define __designated_init #endif +/* + * Optional: only supported since clang >= 14.0 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-error-function-attribute + */ +#if __has_attribute(__error__) +# define __compiletime_error(msg) __attribute__((__error__(msg))) +#else +# define __compiletime_error(msg) +#endif + /* * Optional: not supported by clang * @@ -286,6 +299,17 @@ */ #define __must_check __attribute__((__warn_unused_result__)) +/* + * Optional: only supported since clang >= 14.0 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warning-function-attribute + */ +#if __has_attribute(__warning__) +# define __compiletime_warning(msg) __attribute__((__warning__(msg))) +#else +# define __compiletime_warning(msg) +#endif + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d29bda7f6ebd..8246d0caffa6 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -294,12 +294,6 @@ struct ftrace_likely_data { #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 #endif -#ifndef __compiletime_warning -# define __compiletime_warning(message) -#endif -#ifndef __compiletime_error -# define __compiletime_error(message) -#endif #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ -- cgit v1.2.3-71-gd317 From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Wed, 8 Sep 2021 18:10:05 -0700 Subject: 
mm/hugetlb: initialize hugetlb_usage in mm_init After fork, the child process will get incorrect (2x) hugetlb_usage. If a process uses 5 2MB hugetlb pages in an anonymous mapping, HugetlbPages: 10240 kB and then forks, the child will show, HugetlbPages: 20480 kB The reason for double the amount is because hugetlb_usage will be copied from the parent and then increased when we copy page tables from parent to child. Child will have 2x actual usage. Fix this by adding hugetlb_count_init in mm_init. Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 9 +++++++++ kernel/fork.c | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f7ca1a3870ea..1faebe1cd0ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -858,6 +858,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); +static inline void hugetlb_count_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->hugetlb_usage, 0); +} + static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); @@ -1042,6 +1047,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; } +static inline void hugetlb_count_init(struct mm_struct *mm) +{ +} + static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } diff --git a/kernel/fork.c b/kernel/fork.c index ff5be23800af..38681ad44c76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1063,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif 
mm_init_uprobes_state(mm); + hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3-71-gd317 From 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 8 Sep 2021 18:10:14 -0700 Subject: mmap_lock: change trace and locking order Print to the trace log before releasing the lock to avoid racing with other trace log printers of the same lock type. Link: https://lkml.kernel.org/r/20210903022041.1843024-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Steven Rostedt (VMware) Reviewed-by: Matthew Wilcox (Oracle) Cc: Michel Lespinasse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..b179f1e3541a 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -101,14 +101,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) static inline void mmap_write_unlock(struct mm_struct *mm) { - up_write(&mm->mmap_lock); __mmap_lock_trace_released(mm, true); + up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { - downgrade_write(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); + downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) @@ -140,8 +140,8 @@ static inline bool mmap_read_trylock(struct mm_struct *mm) static inline void mmap_read_unlock(struct mm_struct *mm) { - up_read(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read(&mm->mmap_lock); } static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) @@ -155,8 +155,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { - 
up_read_non_owner(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read_non_owner(&mm->mmap_lock); } static inline void mmap_assert_locked(struct mm_struct *mm) -- cgit v1.2.3-71-gd317 From 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2021 17:00:29 -0700 Subject: net/af_unix: fix a data-race in unix_dgram_poll syzbot reported another data-race in af_unix [1] Let's change __skb_insert() to use WRITE_ONCE() when changing skb head qlen. Also, change unix_dgram_poll() to use lockless version of unix_recvq_full(). It is very possible we can switch all/most unix_recvq_full() to the lockless version, this will be done in a future kernel version. [1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1 BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0: __skb_insert include/linux/skbuff.h:1938 [inline] __skb_queue_before include/linux/skbuff.h:2043 [inline] __skb_queue_tail include/linux/skbuff.h:2076 [inline] skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532 __do_sys_sendmmsg net/socket.c:2561 [inline] __se_sys_sendmmsg net/socket.c:2558 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1: skb_queue_len include/linux/skbuff.h:1869 [inline] unix_recvq_full net/unix/af_unix.c:194 [inline] unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777 sock_poll+0x23e/0x260 net/socket.c:1288 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll fs/eventpoll.c:846 [inline]
ep_send_events fs/eventpoll.c:1683 [inline] ep_poll fs/eventpoll.c:1798 [inline] do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline] __se_sys_epoll_wait fs/eventpoll.c:2233 [inline] __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000001b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G W 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()") Cc: Qian Cai Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/unix/af_unix.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bdb0db3e825..841e2f0f5240 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1940,7 +1940,7 @@ static inline void __skb_insert(struct sk_buff *newsk, WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); WRITE_ONCE(prev->next, newsk); - list->qlen++; + WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb47b9de2380..92345c9bb60c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3073,7 +3073,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; -- cgit v1.2.3-71-gd317 From 5dfe50b05588010f347cb2f436434bf22b7a84ed Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:38 +0900 Subject: bootconfig: 
Rename xbc_node_find_child() to xbc_node_find_subkey() Rename xbc_node_find_child() to xbc_node_find_subkey() for clarifying that function returns a key node (no value node). Since there are xbc_node_for_each_child() (loop on all child nodes) and xbc_node_for_each_subkey() (loop on only subkey nodes), this name distinction is necessary to avoid confusing users. Link: https://lkml.kernel.org/r/163119459826.161018.11200274779483115300.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 4 ++-- kernel/trace/trace_boot.c | 24 ++++++++++++------------ lib/bootconfig.c | 8 ++++---- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index abe089c27529..537e1b991f11 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -110,7 +110,7 @@ static inline __init bool xbc_node_is_leaf(struct xbc_node *node) } /* Tree-based key-value access APIs */ -struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent, +struct xbc_node * __init xbc_node_find_subkey(struct xbc_node *parent, const char *key); const char * __init xbc_node_find_value(struct xbc_node *parent, @@ -148,7 +148,7 @@ xbc_find_value(const char *key, struct xbc_node **vnode) */ static inline struct xbc_node * __init xbc_find_node(const char *key) { - return xbc_node_find_child(NULL, key); + return xbc_node_find_subkey(NULL, key); } /** diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index db6ee372dc6d..8d252f63cd78 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -262,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ - knode = xbc_node_find_child(hnode, "trace"); + knode = xbc_node_find_subkey(hnode, "trace"); if (!knode) - knode = xbc_node_find_child(hnode, "save"); + 
knode = xbc_node_find_subkey(hnode, "save"); if (knode) { anode = xbc_node_get_child(knode); @@ -283,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, sep = ','; } append_printf(bufp, end, ")"); - } else if (xbc_node_find_child(hnode, "snapshot")) { + } else if (xbc_node_find_subkey(hnode, "snapshot")) { append_printf(bufp, end, ".snapshot()"); } else { pr_err("hist.%s requires an action.\n", @@ -314,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, break; } - if (xbc_node_find_child(hnode, param)) + if (xbc_node_find_subkey(hnode, param)) ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); return ret; @@ -374,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) if (p) append_printf(&buf, end, ":name=%s", p); - node = xbc_node_find_child(hnode, "var"); + node = xbc_node_find_subkey(hnode, "var"); if (node) { xbc_node_for_each_key_value(node, knode, p) { /* Expression must not include spaces. 
*/ @@ -393,13 +393,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ - node = xbc_node_find_child(hnode, "onmax"); + node = xbc_node_find_subkey(hnode, "onmax"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onchange"); + node = xbc_node_find_subkey(hnode, "onchange"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onmatch"); + node = xbc_node_find_subkey(hnode, "onmatch"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; @@ -436,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file, } } - if (xbc_node_find_child(hnode, "keys")) { + if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) @@ -495,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); } - anode = xbc_node_find_child(enode, "hist"); + anode = xbc_node_find_subkey(enode, "hist"); if (anode) trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) @@ -517,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) bool enable, enable_all = false; const char *data; - node = xbc_node_find_child(node, "event"); + node = xbc_node_find_subkey(node, "event"); if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ @@ -620,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node) struct trace_array *tr; const char *p; - node = xbc_node_find_child(node, "instance"); + node = xbc_node_find_subkey(node, "instance"); if (!node) return; diff --git a/lib/bootconfig.c 
b/lib/bootconfig.c index 927017431fb6..f8419cff1147 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -142,16 +142,16 @@ xbc_node_match_prefix(struct xbc_node *node, const char **prefix) } /** - * xbc_node_find_child() - Find a child node which matches given key + * xbc_node_find_subkey() - Find a subkey node which matches given key * @parent: An XBC node. * @key: A key string. * - * Search a node under @parent which matches @key. The @key can contain + * Search a key node under @parent which matches @key. The @key can contain * several words jointed with '.'. If @parent is NULL, this searches the * node from whole tree. Return NULL if no node is matched. */ struct xbc_node * __init -xbc_node_find_child(struct xbc_node *parent, const char *key) +xbc_node_find_subkey(struct xbc_node *parent, const char *key) { struct xbc_node *node; @@ -191,7 +191,7 @@ const char * __init xbc_node_find_value(struct xbc_node *parent, const char *key, struct xbc_node **vnode) { - struct xbc_node *node = xbc_node_find_child(parent, key); + struct xbc_node *node = xbc_node_find_subkey(parent, key); if (!node || !xbc_node_is_key(node)) return NULL; -- cgit v1.2.3-71-gd317 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff 
e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary at tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run. This patch fixed the issue by (1). using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and (2). dropping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R.
Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- include/linux/mmap_lock.h | 9 --------- kernel/bpf/stackmap.c | 10 ++++++++-- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..3af8f7fb067d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm) __mmap_lock_trace_released(mm, false); } -static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) -{ - if (mmap_read_trylock(mm)) { - rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); - return true; - } - return false; -} - static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. 
+ */ + rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } -- cgit v1.2.3-71-gd317 From 8c854303ce0e38e5bbedd725ff39da7e235865d8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:21 +0200 Subject: cpu/hotplug: Remove deprecated CPU-hotplug functions. No users in tree use the deprecated CPU-hotplug functions anymore. Remove them. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-39-bigeasy@linutronix.de --- include/linux/cpu.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 94a578a96202..9cf51e41e697 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -143,12 +143,6 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; } static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } #endif /* !CONFIG_HOTPLUG_CPU */ -/* Wrappers which go away once all code is converted */ -static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } -static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } -static inline void get_online_cpus(void) { cpus_read_lock(); } -static inline void put_online_cpus(void) { cpus_read_unlock(); } - #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); -- cgit v1.2.3-71-gd317 From c9871c800f65fffed40f3df3e1eb38984f95cfcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Sep 2021 14:34:59 +0200 Subject: Documentation: core-api/cpuhotplug: Rewrite the API section Dave stumbled over the incomplete and confusing documentation of the CPU hotplug API. Rewrite it, add the missing function documentations and correct the existing ones.
Reported-by: Dave Chinner Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210909123212.489059409@linutronix.de --- Documentation/core-api/cpu_hotplug.rst | 579 +++++++++++++++++++++++++++------ include/linux/cpuhotplug.h | 132 ++++++-- 2 files changed, 590 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst index b66e3cae1472..c6f4ba2fb32d 100644 --- a/Documentation/core-api/cpu_hotplug.rst +++ b/Documentation/core-api/cpu_hotplug.rst @@ -2,12 +2,13 @@ CPU hotplug in the Kernel ========================= -:Date: December, 2016 +:Date: September, 2021 :Author: Sebastian Andrzej Siewior , - Rusty Russell , - Srivatsa Vaddagiri , - Ashok Raj , - Joel Schopp + Rusty Russell , + Srivatsa Vaddagiri , + Ashok Raj , + Joel Schopp , + Thomas Gleixner Introduction ============ @@ -158,100 +159,480 @@ at state ``CPUHP_OFFLINE``. This includes: * Once all services are migrated, kernel calls an arch specific routine ``__cpu_disable()`` to perform arch specific cleanup. -Using the hotplug API ---------------------- - -It is possible to receive notifications once a CPU is offline or onlined. This -might be important to certain drivers which need to perform some kind of setup -or clean up functions based on the number of available CPUs:: - - #include - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "X/Y:online", - Y_online, Y_prepare_down); - -*X* is the subsystem and *Y* the particular driver. The *Y_online* callback -will be invoked during registration on all online CPUs. If an error -occurs during the online callback the *Y_prepare_down* callback will be -invoked on all CPUs on which the online callback was previously invoked. -After registration completed, the *Y_online* callback will be invoked -once a CPU is brought online and *Y_prepare_down* will be invoked when a -CPU is shutdown. 
All resources which were previously allocated in -*Y_online* should be released in *Y_prepare_down*. -The return value *ret* is negative if an error occurred during the -registration process. Otherwise a positive value is returned which -contains the allocated hotplug for dynamically allocated states -(*CPUHP_AP_ONLINE_DYN*). It will return zero for predefined states. - -The callback can be remove by invoking ``cpuhp_remove_state()``. In case of a -dynamically allocated state (*CPUHP_AP_ONLINE_DYN*) use the returned state. -During the removal of a hotplug state the teardown callback will be invoked. - -Multiple instances -~~~~~~~~~~~~~~~~~~ - -If a driver has multiple instances and each instance needs to perform the -callback independently then it is likely that a ''multi-state'' should be used. -First a multi-state state needs to be registered:: - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "X/Y:online, - Y_online, Y_prepare_down); - Y_hp_online = ret; - -The ``cpuhp_setup_state_multi()`` behaves similar to ``cpuhp_setup_state()`` -except it prepares the callbacks for a multi state and does not invoke -the callbacks. This is a one time setup. -Once a new instance is allocated, you need to register this new instance:: - - ret = cpuhp_state_add_instance(Y_hp_online, &d->node); - -This function will add this instance to your previously allocated -*Y_hp_online* state and invoke the previously registered callback -(*Y_online*) on all online CPUs. The *node* element is a ``struct -hlist_node`` member of your per-instance data structure. - -On removal of the instance:: - - cpuhp_state_remove_instance(Y_hp_online, &d->node) - -should be invoked which will invoke the teardown callback on all online -CPUs. 
- -Manual setup -~~~~~~~~~~~~ - -Usually it is handy to invoke setup and teardown callbacks on registration or -removal of a state because usually the operation needs to performed once a CPU -goes online (offline) and during initial setup (shutdown) of the driver. However -each registration and removal function is also available with a ``_nocalls`` -suffix which does not invoke the provided callbacks if the invocation of the -callbacks is not desired. During the manual setup (or teardown) the functions -``cpus_read_lock()`` and ``cpus_read_unlock()`` should be used to inhibit CPU -hotplug operations. - - -The ordering of the events --------------------------- - -The hotplug states are defined in ``include/linux/cpuhotplug.h``: - -* The states *CPUHP_OFFLINE* … *CPUHP_AP_OFFLINE* are invoked before the - CPU is up. -* The states *CPUHP_AP_OFFLINE* … *CPUHP_AP_ONLINE* are invoked - just the after the CPU has been brought up. The interrupts are off and - the scheduler is not yet active on this CPU. Starting with *CPUHP_AP_OFFLINE* - the callbacks are invoked on the target CPU. -* The states between *CPUHP_AP_ONLINE_DYN* and *CPUHP_AP_ONLINE_DYN_END* are - reserved for the dynamic allocation. -* The states are invoked in the reverse order on CPU shutdown starting with - *CPUHP_ONLINE* and stopping at *CPUHP_OFFLINE*. Here the callbacks are - invoked on the CPU that will be shutdown until *CPUHP_AP_OFFLINE*. - -A dynamically allocated state via *CPUHP_AP_ONLINE_DYN* is often enough. -However if an earlier invocation during the bring up or shutdown is required -then an explicit state should be acquired. An explicit state might also be -required if the hotplug event requires specific ordering in respect to -another hotplug event. + +The CPU hotplug API +=================== + +CPU hotplug state machine +------------------------- + +CPU hotplug uses a trivial state machine with a linear state space from +CPUHP_OFFLINE to CPUHP_ONLINE. 
Each state has a startup and a teardown +callback. + +When a CPU is onlined, the startup callbacks are invoked sequentially until +the state CPUHP_ONLINE is reached. They can also be invoked when the +callbacks of a state are set up or an instance is added to a multi-instance +state. + +When a CPU is offlined the teardown callbacks are invoked in the reverse +order sequentially until the state CPUHP_OFFLINE is reached. They can also +be invoked when the callbacks of a state are removed or an instance is +removed from a multi-instance state. + +If a usage site requires only a callback in one direction of the hotplug +operations (CPU online or CPU offline) then the other not-required callback +can be set to NULL when the state is set up. + +The state space is divided into three sections: + +* The PREPARE section + + The PREPARE section covers the state space from CPUHP_OFFLINE to + CPUHP_BRINGUP_CPU. + + The startup callbacks in this section are invoked before the CPU is + started during a CPU online operation. The teardown callbacks are invoked + after the CPU has become dysfunctional during a CPU offline operation. + + The callbacks are invoked on a control CPU as they can't obviously run on + the hotplugged CPU which is either not yet started or has become + dysfunctional already. + + The startup callbacks are used to setup resources which are required to + bring a CPU successfully online. The teardown callbacks are used to free + resources or to move pending work to an online CPU after the hotplugged + CPU became dysfunctional. + + The startup callbacks are allowed to fail. If a callback fails, the CPU + online operation is aborted and the CPU is brought down to the previous + state (usually CPUHP_OFFLINE) again. + + The teardown callbacks in this section are not allowed to fail. + +* The STARTING section + + The STARTING section covers the state space between CPUHP_BRINGUP_CPU + 1 + and CPUHP_AP_ONLINE. 
+ + The startup callbacks in this section are invoked on the hotplugged CPU + with interrupts disabled during a CPU online operation in the early CPU + setup code. The teardown callbacks are invoked with interrupts disabled + on the hotplugged CPU during a CPU offline operation shortly before the + CPU is completely shut down. + + The callbacks in this section are not allowed to fail. + + The callbacks are used for low level hardware initialization/shutdown and + for core subsystems. + +* The ONLINE section + + The ONLINE section covers the state space between CPUHP_AP_ONLINE + 1 and + CPUHP_ONLINE. + + The startup callbacks in this section are invoked on the hotplugged CPU + during a CPU online operation. The teardown callbacks are invoked on the + hotplugged CPU during a CPU offline operation. + + The callbacks are invoked in the context of the per CPU hotplug thread, + which is pinned on the hotplugged CPU. The callbacks are invoked with + interrupts and preemption enabled. + + The callbacks are allowed to fail. When a callback fails the hotplug + operation is aborted and the CPU is brought back to the previous state. + +CPU online/offline operations +----------------------------- + +A successful online operation looks like this:: + + [CPUHP_OFFLINE] + [CPUHP_OFFLINE + 1]->startup() -> success + [CPUHP_OFFLINE + 2]->startup() -> success + [CPUHP_OFFLINE + 3] -> skipped because startup == NULL + ... + [CPUHP_BRINGUP_CPU]->startup() -> success + === End of PREPARE section + [CPUHP_BRINGUP_CPU + 1]->startup() -> success + ... + [CPUHP_AP_ONLINE]->startup() -> success + === End of STARTUP section + [CPUHP_AP_ONLINE + 1]->startup() -> success + ... + [CPUHP_ONLINE - 1]->startup() -> success + [CPUHP_ONLINE] + +A successful offline operation looks like this:: + + [CPUHP_ONLINE] + [CPUHP_ONLINE - 1]->teardown() -> success + ... + [CPUHP_AP_ONLINE + 1]->teardown() -> success + === Start of STARTUP section + [CPUHP_AP_ONLINE]->teardown() -> success + ... 
+ [CPUHP_BRINGUP_CPU + 1]->teardown() + ... + === Start of PREPARE section + [CPUHP_BRINGUP_CPU]->teardown() + [CPUHP_OFFLINE + 3]->teardown() + [CPUHP_OFFLINE + 2] -> skipped because teardown == NULL + [CPUHP_OFFLINE + 1]->teardown() + [CPUHP_OFFLINE] + +A failed online operation looks like this:: + + [CPUHP_OFFLINE] + [CPUHP_OFFLINE + 1]->startup() -> success + [CPUHP_OFFLINE + 2]->startup() -> success + [CPUHP_OFFLINE + 3] -> skipped because startup == NULL + ... + [CPUHP_BRINGUP_CPU]->startup() -> success + === End of PREPARE section + [CPUHP_BRINGUP_CPU + 1]->startup() -> success + ... + [CPUHP_AP_ONLINE]->startup() -> success + === End of STARTUP section + [CPUHP_AP_ONLINE + 1]->startup() -> success + --- + [CPUHP_AP_ONLINE + N]->startup() -> fail + [CPUHP_AP_ONLINE + (N - 1)]->teardown() + ... + [CPUHP_AP_ONLINE + 1]->teardown() + === Start of STARTUP section + [CPUHP_AP_ONLINE]->teardown() + ... + [CPUHP_BRINGUP_CPU + 1]->teardown() + ... + === Start of PREPARE section + [CPUHP_BRINGUP_CPU]->teardown() + [CPUHP_OFFLINE + 3]->teardown() + [CPUHP_OFFLINE + 2] -> skipped because teardown == NULL + [CPUHP_OFFLINE + 1]->teardown() + [CPUHP_OFFLINE] + +A failed offline operation looks like this:: + + [CPUHP_ONLINE] + [CPUHP_ONLINE - 1]->teardown() -> success + ... + [CPUHP_ONLINE - N]->teardown() -> fail + [CPUHP_ONLINE - (N - 1)]->startup() + ... + [CPUHP_ONLINE - 1]->startup() + [CPUHP_ONLINE] + +Recursive failures cannot be handled sensibly. Look at the following +example of a recursive fail due to a failed offline operation: :: + + [CPUHP_ONLINE] + [CPUHP_ONLINE - 1]->teardown() -> success + ...
+ [CPUHP_ONLINE - N]->teardown() -> fail + [CPUHP_ONLINE - (N - 1)]->startup() -> success + [CPUHP_ONLINE - (N - 2)]->startup() -> fail + +The CPU hotplug state machine stops right here and does not try to go back +down again because that would likely result in an endless loop:: + + [CPUHP_ONLINE - (N - 1)]->teardown() -> success + [CPUHP_ONLINE - N]->teardown() -> fail + [CPUHP_ONLINE - (N - 1)]->startup() -> success + [CPUHP_ONLINE - (N - 2)]->startup() -> fail + [CPUHP_ONLINE - (N - 1)]->teardown() -> success + [CPUHP_ONLINE - N]->teardown() -> fail + +Lather, rinse and repeat. In this case the CPU is left in state:: + + [CPUHP_ONLINE - (N - 1)] + +which at least lets the system make progress and gives the user a chance to +debug or even resolve the situation. + +Allocating a state +------------------ + +There are two ways to allocate a CPU hotplug state: + +* Static allocation + + Static allocation has to be used when the subsystem or driver has + ordering requirements versus other CPU hotplug states. E.g. the PERF core + startup callback has to be invoked before the PERF driver startup + callbacks during a CPU online operation. During a CPU offline operation + the driver teardown callbacks have to be invoked before the core teardown + callback. The statically allocated states are described by constants in + the cpuhp_state enum which can be found in include/linux/cpuhotplug.h. + + Insert the state into the enum at the proper place so the ordering + requirements are fulfilled. The state constant has to be used for state + setup and removal. + + Static allocation is also required when the state callbacks are not set + up at runtime and are part of the initializer of the CPU hotplug state + array in kernel/cpu.c. + +* Dynamic allocation + + When there are no ordering requirements for the state callbacks then + dynamic allocation is the preferred method. The state number is allocated + by the setup function and returned to the caller on success.
+ + Only the PREPARE and ONLINE sections provide a dynamic allocation + range. The STARTING section does not as most of the callbacks in that + section have explicit ordering requirements. + +Setup of a CPU hotplug state +---------------------------- + +The core code provides the following functions to setup a state: + +* cpuhp_setup_state(state, name, startup, teardown) +* cpuhp_setup_state_nocalls(state, name, startup, teardown) +* cpuhp_setup_state_cpuslocked(state, name, startup, teardown) +* cpuhp_setup_state_nocalls_cpuslocked(state, name, startup, teardown) + +For cases where a driver or a subsystem has multiple instances and the same +CPU hotplug state callbacks need to be invoked for each instance, the CPU +hotplug core provides multi-instance support. The advantage over driver +specific instance lists is that the instance related functions are fully +serialized against CPU hotplug operations and provide the automatic +invocations of the state callbacks on add and removal. To set up such a +multi-instance state the following function is available: + +* cpuhp_setup_state_multi(state, name, startup, teardown) + +The @state argument is either a statically allocated state or one of the +constants for dynamically allocated states - CPUHP_BP_PREPARE_DYN, +CPUHP_AP_ONLINE_DYN - depending on the state section (PREPARE, ONLINE) for +which a dynamic state should be allocated. + +The @name argument is used for sysfs output and for instrumentation. The +naming convention is "subsys:mode" or "subsys/driver:mode", +e.g. "perf:mode" or "perf/x86:mode".
The common mode names are: + +======== ======================================================= +prepare For states in the PREPARE section + +dead For states in the PREPARE section which do not provide + a startup callback + +starting For states in the STARTING section + +dying For states in the STARTING section which do not provide + a startup callback + +online For states in the ONLINE section + +offline For states in the ONLINE section which do not provide + a startup callback +======== ======================================================= + +As the @name argument is only used for sysfs and instrumentation other mode +descriptors can be used as well if they describe the nature of the state +better than the common ones. + +Examples for @name arguments: "perf/online", "perf/x86:prepare", +"RCU/tree:dying", "sched/waitempty" + +The @startup argument is a function pointer to the callback which should be +invoked during a CPU online operation. If the usage site does not require a +startup callback set the pointer to NULL. + +The @teardown argument is a function pointer to the callback which should +be invoked during a CPU offline operation. If the usage site does not +require a teardown callback set the pointer to NULL. + +The functions differ in the way how the installed callbacks are treated: + + * cpuhp_setup_state_nocalls(), cpuhp_setup_state_nocalls_cpuslocked() + and cpuhp_setup_state_multi() only install the callbacks + + * cpuhp_setup_state() and cpuhp_setup_state_cpuslocked() install the + callbacks and invoke the @startup callback (if not NULL) for all online + CPUs which have currently a state greater than the newly installed + state. Depending on the state section the callback is either invoked on + the current CPU (PREPARE section) or on each online CPU (ONLINE + section) in the context of the CPU's hotplug thread. + + If a callback fails for CPU N then the teardown callback for CPU + 0 .. N-1 is invoked to rollback the operation. 
The state setup fails, + the callbacks for the state are not installed and in case of dynamic + allocation the allocated state is freed. + +The state setup and the callback invocations are serialized against CPU +hotplug operations. If the setup function has to be called from a CPU +hotplug read locked region, then the _cpuslocked() variants have to be +used. These functions cannot be used from within CPU hotplug callbacks. + +The function return values: + ======== =================================================================== + 0 Statically allocated state was successfully set up + + >0 Dynamically allocated state was successfully set up. + + The returned number is the state number which was allocated. If + the state callbacks have to be removed later, e.g. module + removal, then this number has to be saved by the caller and used + as @state argument for the state remove function. For + multi-instance states the dynamically allocated state number is + also required as @state argument for the instance add/remove + operations. + + <0 Operation failed + ======== =================================================================== + +Removal of a CPU hotplug state +------------------------------ + +To remove a previously set up state, the following functions are provided: + +* cpuhp_remove_state(state) +* cpuhp_remove_state_nocalls(state) +* cpuhp_remove_state_nocalls_cpuslocked(state) +* cpuhp_remove_multi_state(state) + +The @state argument is either a statically allocated state or the state +number which was allocated in the dynamic range by cpuhp_setup_state*(). If +the state is in the dynamic range, then the state number is freed and +available for dynamic allocation again. + +The functions differ in the way how the installed callbacks are treated: + + * cpuhp_remove_state_nocalls(), cpuhp_remove_state_nocalls_cpuslocked() + and cpuhp_remove_multi_state() only remove the callbacks. 
+ + * cpuhp_remove_state() removes the callbacks and invokes the teardown + callback (if not NULL) for all online CPUs which have currently a state + greater than the removed state. Depending on the state section the + callback is either invoked on the current CPU (PREPARE section) or on + each online CPU (ONLINE section) in the context of the CPU's hotplug + thread. + + In order to complete the removal, the teardown callback should not fail. + +The state removal and the callback invocations are serialized against CPU +hotplug operations. If the remove function has to be called from a CPU +hotplug read locked region, then the _cpuslocked() variants have to be +used. These functions cannot be used from within CPU hotplug callbacks. + +If a multi-instance state is removed then the caller has to remove all +instances first. + +Multi-Instance state instance management +---------------------------------------- + +Once the multi-instance state is set up, instances can be added to the +state: + + * cpuhp_state_add_instance(state, node) + * cpuhp_state_add_instance_nocalls(state, node) + +The @state argument is either a statically allocated state or the state +number which was allocated in the dynamic range by cpuhp_setup_state_multi(). + +The @node argument is a pointer to an hlist_node which is embedded in the +instance's data structure. The pointer is handed to the multi-instance +state callbacks and can be used by the callback to retrieve the instance +via container_of(). + +The functions differ in the way how the installed callbacks are treated: + + * cpuhp_state_add_instance_nocalls() only adds the instance to the + multi-instance state's node list. + + * cpuhp_state_add_instance() adds the instance and invokes the startup + callback (if not NULL) associated with @state for all online CPUs which + have currently a state greater than @state. The callback is only + invoked for the to be added instance.
Depending on the state section + the callback is either invoked on the current CPU (PREPARE section) or + on each online CPU (ONLINE section) in the context of the CPU's hotplug + thread. + + If a callback fails for CPU N then the teardown callback for CPU + 0 .. N-1 is invoked to rollback the operation, the function fails and + the instance is not added to the node list of the multi-instance state. + +To remove an instance from the state's node list these functions are +available: + + * cpuhp_state_remove_instance(state, node) + * cpuhp_state_remove_instance_nocalls(state, node) + +The arguments are the same as for the cpuhp_state_add_instance*() +variants above. + +The functions differ in the way how the installed callbacks are treated: + + * cpuhp_state_remove_instance_nocalls() only removes the instance from the + state's node list. + + * cpuhp_state_remove_instance() removes the instance and invokes the + teardown callback (if not NULL) associated with @state for all online + CPUs which have currently a state greater than @state. The callback is + only invoked for the to be removed instance. Depending on the state + section the callback is either invoked on the current CPU (PREPARE + section) or on each online CPU (ONLINE section) in the context of the + CPU's hotplug thread. + + In order to complete the removal, the teardown callback should not fail. + +The node list add/remove operations and the callback invocations are +serialized against CPU hotplug operations. These functions cannot be used +from within CPU hotplug callbacks and CPU hotplug read locked regions. + +Examples +-------- + +Setup and teardown a statically allocated state in the STARTING section for +notifications on online and offline operations:: + + ret = cpuhp_setup_state(CPUHP_SUBSYS_STARTING, "subsys:starting", subsys_cpu_starting, subsys_cpu_dying); + if (ret < 0) + return ret; + ....
+ cpuhp_remove_state(CPUHP_SUBSYS_STARTING); + +Setup and teardown a dynamically allocated state in the ONLINE section +for notifications on offline operations:: + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "subsys:offline", NULL, subsys_cpu_offline); + if (state < 0) + return state; + .... + cpuhp_remove_state(state); + +Setup and teardown a dynamically allocated state in the ONLINE section +for notifications on online operations without invoking the callbacks:: + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "subsys:online", subsys_cpu_online, NULL); + if (state < 0) + return state; + .... + cpuhp_remove_state_nocalls(state); + +Setup, use and teardown a dynamically allocated multi-instance state in the +ONLINE section for notifications on online and offline operation:: + + state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "subsys:online", subsys_cpu_online, subsys_cpu_offline); + if (state < 0) + return state; + .... + ret = cpuhp_state_add_instance(state, &inst1->node); + if (ret) + return ret; + .... + ret = cpuhp_state_add_instance(state, &inst2->node); + if (ret) + return ret; + .... + cpuhp_state_remove_instance(state, &inst1->node); + .... + cpuhp_state_remove_instance(state, &inst2->node); + .... + cpuhp_remove_multi_state(state); + Testing of hotplug states ========================= diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 39cf84a30b9f..832d8a74fa59 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,8 +22,42 @@ * AP_ACTIVE AP_ACTIVE */ +/* + * CPU hotplug states. The state machine invokes the installed state + * startup callbacks sequentially from CPUHP_OFFLINE + 1 to CPUHP_ONLINE + * during a CPU online operation. During a CPU offline operation the + * installed teardown callbacks are invoked in the reverse order from + * CPUHP_ONLINE - 1 down to CPUHP_OFFLINE. + * + * The state space has three sections: PREPARE, STARTING and ONLINE.
+ * + * PREPARE: The callbacks are invoked on a control CPU before the + * hotplugged CPU is started up or after the hotplugged CPU has died. + * + * STARTING: The callbacks are invoked on the hotplugged CPU from the low level + * hotplug startup/teardown code with interrupts disabled. + * + * ONLINE: The callbacks are invoked on the hotplugged CPU from the per CPU + * hotplug thread with interrupts and preemption enabled. + * + * Adding explicit states to this enum is only necessary when: + * + * 1) The state is within the STARTING section + * + * 2) The state has ordering constraints vs. other states in the + * same section. + * + * If neither #1 nor #2 apply, please use the dynamic state space when + * setting up a state by using CPUHP_BP_PREPARE_DYN or CPUHP_AP_ONLINE_DYN + * for the @state argument of the setup function. + * + * See Documentation/core-api/cpu_hotplug.rst for further information and + * examples. + */ enum cpuhp_state { CPUHP_INVALID = -1, + + /* PREPARE section invoked on a control CPU */ CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, @@ -95,6 +129,11 @@ enum cpuhp_state { CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, + + /* + * STARTING section invoked on the hotplugged CPU in low level + * bringup and teardown code.
+ */ CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_SCHED_STARTING, @@ -155,6 +194,8 @@ enum cpuhp_state { CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, + + /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, @@ -216,14 +257,15 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, int (*teardown)(unsigned int cpu), bool multi_instance); /** - * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * cpuhp_setup_state - Setup hotplug state callbacks with calling the @startup + * callback * @state: The state for which the calls are installed * @name: Name of the callback (will be used in debug output) - * @startup: startup callback function - * @teardown: teardown callback function + * @startup: startup callback function or NULL if not required + * @teardown: teardown callback function or NULL if not required * - * Installs the callback functions and invokes the startup callback on - * the present cpus which have already reached the @state. + * Installs the callback functions and invokes the @startup callback on + * the online cpus which have already reached the @state. 
*/ static inline int cpuhp_setup_state(enum cpuhp_state state, const char *name, @@ -233,6 +275,18 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, return __cpuhp_setup_state(state, name, true, startup, teardown, false); } +/** + * cpuhp_setup_state_cpuslocked - Setup hotplug state callbacks with calling + * @startup callback from a cpus_read_lock() + * held region + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function or NULL if not required + * @teardown: teardown callback function or NULL if not required + * + * Same as cpuhp_setup_state() except that it must be invoked from within a + * cpus_read_lock() held region. + */ static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), @@ -244,14 +298,14 @@ static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the - * callbacks + * @startup callback * @state: The state for which the calls are installed * @name: Name of the callback. - * @startup: startup callback function - * @teardown: teardown callback function + * @startup: startup callback function or NULL if not required + * @teardown: teardown callback function or NULL if not required * - * Same as @cpuhp_setup_state except that no calls are executed are invoked - * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. + * Same as cpuhp_setup_state() except that the @startup callback is not + * invoked during installation. NOP if SMP=n or HOTPLUG_CPU=n. 
*/ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, const char *name, @@ -262,6 +316,19 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, false); } +/** + * cpuhp_setup_state_nocalls_cpuslocked - Setup hotplug state callbacks without + * invoking the @startup callback from + * a cpus_read_lock() held region + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function or NULL if not required + * @teardown: teardown callback function or NULL if not required + * + * Same as cpuhp_setup_state_nocalls() except that it must be invoked from + * within a cpus_read_lock() held region. + */ static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), @@ -275,13 +342,13 @@ static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed * @name: Name of the callback. - * @startup: startup callback function - * @teardown: teardown callback function + * @startup: startup callback function or NULL if not required + * @teardown: teardown callback function or NULL if not required * * Sets the internal multi_instance flag and prepares a state to work as a multi * instance callback. No callbacks are invoked at this point. The callbacks are * invoked once an instance for this state are registered via - * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls. + * cpuhp_state_add_instance() or cpuhp_state_add_instance_nocalls() */ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, const char *name, @@ -306,9 +373,10 @@ int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, * @state: The state for which the instance is installed * @node: The node for this individual state. 
* - * Installs the instance for the @state and invokes the startup callback on - * the present cpus which have already reached the @state. The @state must have - * been earlier marked as multi-instance by @cpuhp_setup_state_multi. + * Installs the instance for the @state and invokes the registered startup + * callback on the online cpus which have already reached the @state. The + * @state must have been earlier marked as multi-instance by + * cpuhp_setup_state_multi(). */ static inline int cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node) @@ -322,8 +390,9 @@ static inline int cpuhp_state_add_instance(enum cpuhp_state state, * @state: The state for which the instance is installed * @node: The node for this individual state. * - * Installs the instance for the @state The @state must have been earlier - * marked as multi-instance by @cpuhp_setup_state_multi. + * Installs the instance for the @state. The @state must have been earlier + * marked as multi-instance by cpuhp_setup_state_multi(). NOP if SMP=n or + * HOTPLUG_CPU=n. */ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, struct hlist_node *node) @@ -331,6 +400,17 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, return __cpuhp_state_add_instance(state, node, false); } +/** + * cpuhp_state_add_instance_nocalls_cpuslocked - Add an instance for a state + * without invoking the startup + * callback from a cpus_read_lock() + * held region. + * @state: The state for which the instance is installed + * @node: The node for this individual state. + * + * Same as cpuhp_state_add_instance_nocalls() except that it must be + * invoked from within a cpus_read_lock() held region. 
+ */ static inline int cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, struct hlist_node *node) @@ -346,7 +426,7 @@ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); * @state: The state for which the calls are removed * * Removes the callback functions and invokes the teardown callback on - * the present cpus which have already reached the @state. + * the online cpus which have already reached the @state. */ static inline void cpuhp_remove_state(enum cpuhp_state state) { @@ -355,7 +435,7 @@ static inline void cpuhp_remove_state(enum cpuhp_state state) /** * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking - * teardown + * the teardown callback * @state: The state for which the calls are removed */ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) @@ -363,6 +443,14 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +/** + * cpuhp_remove_state_nocalls_cpuslocked - Remove hotplug state callbacks without invoking + * teardown from a cpus_read_lock() held region. + * @state: The state for which the calls are removed + * + * Same as cpuhp_remove_state_nocalls() except that it must be invoked + * from within a cpus_read_lock() held region. + */ static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) { __cpuhp_remove_state_cpuslocked(state, false); @@ -390,8 +478,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, * @state: The state from which the instance is removed * @node: The node for this individual state. * - * Removes the instance and invokes the teardown callback on the present cpus - * which have already reached the @state. + * Removes the instance and invokes the teardown callback on the online cpus + * which have already reached @state. 
*/ static inline int cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node) -- cgit v1.2.3-71-gd317 From 4eb6bd55cfb22ffc20652732340c4962f3ac9a91 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 10 Sep 2021 16:40:39 -0700 Subject: compiler.h: drop fallback overflow checkers Once upgrading the minimum supported version of GCC to 5.1, we can drop the fallback code for !COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW. This is effectively a revert of commit f0907827a8a9 ("compiler.h: enable builtin overflow checkers and add fallback code") Link: https://github.com/ClangBuiltLinux/linux/issues/1438#issuecomment-916745801 Suggested-by: Rasmus Villemoes Signed-off-by: Nick Desaulniers Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 13 ---- include/linux/compiler-gcc.h | 4 -- include/linux/overflow.h | 138 +----------------------------------- tools/include/linux/compiler-gcc.h | 4 -- tools/include/linux/overflow.h | 140 +------------------------------------ 5 files changed, 6 insertions(+), 293 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 49b0ac8b6fd3..3c4de9b6c6e3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -62,19 +62,6 @@ #define __no_sanitize_coverage #endif -/* - * Not all versions of clang implement the type-generic versions - * of the builtin overflow checkers. Fortunately, clang implements - * __has_builtin allowing us to avoid awkward version - * checks. Unfortunately, we don't know which version of gcc clang - * pretends to be, so the macro may or may not be defined. 
- */ -#if __has_builtin(__builtin_mul_overflow) && \ - __has_builtin(__builtin_add_overflow) && \ - __has_builtin(__builtin_sub_overflow) -#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 -#endif - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cb9217fc60af..3f7f6fa0e051 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -128,10 +128,6 @@ #define __no_sanitize_coverage #endif -#if GCC_VERSION >= 50100 -#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 0f12345c21fb..4669632bd72b 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -6,12 +6,9 @@ #include /* - * In the fallback code below, we need to compute the minimum and - * maximum values representable in a given type. These macros may also - * be useful elsewhere, so we provide them outside the - * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. - * - * It would seem more obvious to do something like + * We need to compute the minimum and maximum values representable in a given + * type. These macros may also be useful elsewhere. It would seem more obvious + * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? 
((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) @@ -54,7 +51,6 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on * a, b and *d having the same type (similar to the min() and max() @@ -90,134 +86,6 @@ static inline bool __must_check __must_check_overflow(bool overflow) __builtin_mul_overflow(__a, __b, __d); \ })) -#else - - -/* Checking for unsigned overflow is relatively easy without causing UB. */ -#define __unsigned_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a + __b; \ - *__d < __a; \ -}) -#define __unsigned_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a - __b; \ - __a < __b; \ -}) -/* - * If one of a or b is a compile-time constant, this avoids a division. - */ -#define __unsigned_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a * __b; \ - __builtin_constant_p(__b) ? \ - __b > 0 && __a > type_max(typeof(__a)) / __b : \ - __a > 0 && __b > type_max(typeof(__b)) / __a; \ -}) - -/* - * For signed types, detecting overflow is much harder, especially if - * we want to avoid UB. But the interface of these macros is such that - * we must provide a result in *d, and in fact we must produce the - * result promised by gcc's builtins, which is simply the possibly - * wrapped-around value. Fortunately, we can just formally do the - * operations in the widest relevant unsigned type (u64) and then - * truncate the result - gcc is smart enough to generate the same code - * with and without the (u64) casts. 
- */ - -/* - * Adding two signed integers can overflow only if they have the same - * sign, and overflow has happened iff the result has the opposite - * sign. - */ -#define __signed_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a + (u64)__b; \ - (((~(__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Subtraction is similar, except that overflow can now happen only - * when the signs are opposite. In this case, overflow has happened if - * the result has the opposite sign of a. - */ -#define __signed_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a - (u64)__b; \ - ((((__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Signed multiplication is rather hard. gcc always follows C99, so - * division is truncated towards 0. This means that we can write the - * overflow check like this: - * - * (a > 0 && (b > MAX/a || b < MIN/a)) || - * (a < -1 && (b > MIN/a || b < MAX/a) || - * (a == -1 && b == MIN) - * - * The redundant casts of -1 are to silence an annoying -Wtype-limits - * (included in -Wextra) warning: When the type is u8 or u16, the - * __b_c_e in check_mul_overflow obviously selects - * __unsigned_mul_overflow, but unfortunately gcc still parses this - * code and warns about the limited range of __b. 
- */ - -#define __signed_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - typeof(a) __tmax = type_max(typeof(a)); \ - typeof(a) __tmin = type_min(typeof(a)); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a * (u64)__b; \ - (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ - (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ - (__b == (typeof(__b))-1 && __a == __tmin); \ -}) - - -#define check_add_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d))) - -#define check_sub_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d))) - -#define check_mul_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d))) - -#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ - /** check_shl_overflow() - Calculate a left-shifted value and check overflow * * @a: Value to be shifted diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h index 95c072b70d0e..a590a1dfafd9 100644 --- a/tools/include/linux/compiler-gcc.h +++ b/tools/include/linux/compiler-gcc.h @@ -38,7 +38,3 @@ #endif #define __printf(a, b) __attribute__((format(printf, a, b))) #define __scanf(a, b) __attribute__((format(scanf, a, b))) - -#if GCC_VERSION >= 50100 -#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 -#endif diff --git a/tools/include/linux/overflow.h b/tools/include/linux/overflow.h index 8712ff70995f..dcb0c1bf6866 100644 --- a/tools/include/linux/overflow.h +++ b/tools/include/linux/overflow.h @@ -5,12 +5,9 @@ #include /* - * In the fallback code below, we need to compute the minimum and - * maximum values representable in a given type. 
These macros may also - * be useful elsewhere, so we provide them outside the - * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. - * - * It would seem more obvious to do something like + * We need to compute the minimum and maximum values representable in a given + * type. These macros may also be useful elsewhere. It would seem more obvious + * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) @@ -36,8 +33,6 @@ #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) - -#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on * a, b and *d having the same type (similar to the min() and max() @@ -73,135 +68,6 @@ __builtin_mul_overflow(__a, __b, __d); \ }) -#else - - -/* Checking for unsigned overflow is relatively easy without causing UB. */ -#define __unsigned_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a + __b; \ - *__d < __a; \ -}) -#define __unsigned_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a - __b; \ - __a < __b; \ -}) -/* - * If one of a or b is a compile-time constant, this avoids a division. - */ -#define __unsigned_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a * __b; \ - __builtin_constant_p(__b) ? \ - __b > 0 && __a > type_max(typeof(__a)) / __b : \ - __a > 0 && __b > type_max(typeof(__b)) / __a; \ -}) - -/* - * For signed types, detecting overflow is much harder, especially if - * we want to avoid UB. 
But the interface of these macros is such that - * we must provide a result in *d, and in fact we must produce the - * result promised by gcc's builtins, which is simply the possibly - * wrapped-around value. Fortunately, we can just formally do the - * operations in the widest relevant unsigned type (u64) and then - * truncate the result - gcc is smart enough to generate the same code - * with and without the (u64) casts. - */ - -/* - * Adding two signed integers can overflow only if they have the same - * sign, and overflow has happened iff the result has the opposite - * sign. - */ -#define __signed_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a + (u64)__b; \ - (((~(__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Subtraction is similar, except that overflow can now happen only - * when the signs are opposite. In this case, overflow has happened if - * the result has the opposite sign of a. - */ -#define __signed_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a - (u64)__b; \ - ((((__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Signed multiplication is rather hard. gcc always follows C99, so - * division is truncated towards 0. This means that we can write the - * overflow check like this: - * - * (a > 0 && (b > MAX/a || b < MIN/a)) || - * (a < -1 && (b > MIN/a || b < MAX/a) || - * (a == -1 && b == MIN) - * - * The redundant casts of -1 are to silence an annoying -Wtype-limits - * (included in -Wextra) warning: When the type is u8 or u16, the - * __b_c_e in check_mul_overflow obviously selects - * __unsigned_mul_overflow, but unfortunately gcc still parses this - * code and warns about the limited range of __b. 
- */ - -#define __signed_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - typeof(a) __tmax = type_max(typeof(a)); \ - typeof(a) __tmin = type_min(typeof(a)); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a * (u64)__b; \ - (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ - (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ - (__b == (typeof(__b))-1 && __a == __tmin); \ -}) - - -#define check_add_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d)) - -#define check_sub_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d)) - -#define check_mul_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d)) - - -#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ - /** * array_size() - Calculate size of 2-dimensional array. * -- cgit v1.2.3-71-gd317 From 4e59869aa6550657cb148ad49835605660ec9b88 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 10 Sep 2021 16:40:46 -0700 Subject: compiler-gcc.h: drop checks for older GCC versions Now that GCC 5.1 is the minimally supported default, drop the values we don't use. 
Signed-off-by: Nick Desaulniers Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 4 +--- tools/include/linux/compiler-gcc.h | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 3f7f6fa0e051..fd82ce169ce9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -98,10 +98,8 @@ #if GCC_VERSION >= 70000 #define KASAN_ABI_VERSION 5 -#elif GCC_VERSION >= 50000 +#else #define KASAN_ABI_VERSION 4 -#elif GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 #endif #if __has_attribute(__no_sanitize_address__) diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h index a590a1dfafd9..43d9a46d36f0 100644 --- a/tools/include/linux/compiler-gcc.h +++ b/tools/include/linux/compiler-gcc.h @@ -16,9 +16,7 @@ # define __fallthrough __attribute__ ((fallthrough)) #endif -#if GCC_VERSION >= 40300 -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* GCC_VERSION >= 40300 */ +#define __compiletime_error(message) __attribute__((error(message))) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) -- cgit v1.2.3-71-gd317 From 6d2ef226f2f18d530e48ead0cb5704505628b797 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Sep 2021 10:20:01 -0700 Subject: compiler_attributes.h: drop __has_attribute() support for gcc4 Now that GCC 5.1 is the minimally supported default, the manual workarounds for older gcc versions not having __has_attribute() are no longer relevant and can be removed. 
Signed-off-by: Linus Torvalds --- include/linux/compiler_attributes.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 2487be0e7199..ba417a5c80af 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -20,26 +20,6 @@ * Provide links to the documentation of each supported compiler, if it exists. */ -/* - * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17. - * In the meantime, to support gcc < 5, we implement __has_attribute - * by hand. - */ -#ifndef __has_attribute -# define __has_attribute(x) __GCC4_has_attribute_##x -# define __GCC4_has_attribute___assume_aligned__ 1 -# define __GCC4_has_attribute___copy__ 0 -# define __GCC4_has_attribute___designated_init__ 0 -# define __GCC4_has_attribute___externally_visible__ 1 -# define __GCC4_has_attribute___no_caller_saved_registers__ 0 -# define __GCC4_has_attribute___noclone__ 1 -# define __GCC4_has_attribute___no_profile_instrument_function__ 0 -# define __GCC4_has_attribute___nonstring__ 0 -# define __GCC4_has_attribute___no_sanitize_address__ 1 -# define __GCC4_has_attribute___no_sanitize_undefined__ 1 -# define __GCC4_has_attribute___fallthrough__ 0 -#endif - /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute */ -- cgit v1.2.3-71-gd317 From df26327ea097eb78e7967c45df6b23010c43c28d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Sep 2021 10:29:44 -0700 Subject: Drop some straggling mentions of gcc-4.9 as being stale Fix up the admin-guide README file to the new gcc-5.1 requirement, and remove a stale comment about gcc support for the __assume_aligned__ attribute. 
Signed-off-by: Linus Torvalds --- Documentation/admin-guide/README.rst | 2 +- Documentation/translations/zh_CN/admin-guide/README.rst | 2 +- Documentation/translations/zh_TW/admin-guide/README.rst | 2 +- include/linux/compiler_attributes.h | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index 35314b63008c..caa3c09a5c3f 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -259,7 +259,7 @@ Configuring the kernel Compiling the kernel -------------------- - - Make sure you have at least gcc 4.9 available. + - Make sure you have at least gcc 5.1 available. For more information, refer to :ref:`Documentation/process/changes.rst `. Please note that you can still run a.out user programs with this kernel. diff --git a/Documentation/translations/zh_CN/admin-guide/README.rst b/Documentation/translations/zh_CN/admin-guide/README.rst index 669a022f6817..980eb20521cf 100644 --- a/Documentation/translations/zh_CN/admin-guide/README.rst +++ b/Documentation/translations/zh_CN/admin-guide/README.rst @@ -223,7 +223,7 @@ Linux内核5.x版本 编译内核 --------- - - 确保您至少有gcc 4.9可用。 + - 确保您至少有gcc 5.1可用。 有关更多信息,请参阅 :ref:`Documentation/process/changes.rst ` 。 请注意,您仍然可以使用此内核运行a.out用户程序。 diff --git a/Documentation/translations/zh_TW/admin-guide/README.rst b/Documentation/translations/zh_TW/admin-guide/README.rst index b752e50359e6..6ce97edbab37 100644 --- a/Documentation/translations/zh_TW/admin-guide/README.rst +++ b/Documentation/translations/zh_TW/admin-guide/README.rst @@ -226,7 +226,7 @@ Linux內核5.x版本 編譯內核 --------- - - 確保您至少有gcc 4.9可用。 + - 確保您至少有gcc 5.1可用。 有關更多信息,請參閱 :ref:`Documentation/process/changes.rst ` 。 請注意,您仍然可以使用此內核運行a.out用戶程序。 diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index ba417a5c80af..ee19cebabcf5 100644 --- a/include/linux/compiler_attributes.h +++ 
b/include/linux/compiler_attributes.h @@ -54,7 +54,6 @@ * compiler should see some alignment anyway, when the return value is * massaged by 'flags = ptr & 3; ptr &= ~3;'). * - * Optional: only supported since gcc >= 4.9 * Optional: not supported by icc * * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-assume_005faligned-function-attribute -- cgit v1.2.3-71-gd317 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. 
Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. 
[0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- include/linux/cgroup-defs.h | 107 +++++++++++-------------------------------- include/linux/cgroup.h | 22 +-------- kernel/cgroup/cgroup.c | 50 ++++---------------- net/core/netclassid_cgroup.c | 7 +-- net/core/netprio_cgroup.c | 10 +--- 5 files changed, 41 insertions(+), 155 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e1c705fdfa7c..db2e147e069f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overridden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. 
- * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. */ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? 
skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. - */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7bf60454a313..75c151413fda 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void 
cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. - */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..8afa8690d288 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. 
*/ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index b49c57d35a88..1a6a86693b74 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); - spin_unlock(&cgroup_sk_update_lock); - } if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; @@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, struct css_task_iter it; struct task_struct *p; - cgroup_sk_alloc_disable(); - cs->classid = (u32)value; css_task_iter_start(css, 0, &it); diff --git a/net/core/netprio_cgroup.c 
b/net/core/netprio_cgroup.c index 99a431c56f23..8456dfbe2eb4 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; - cgroup_sk_alloc_disable(); - rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + + if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); - spin_unlock(&cgroup_sk_update_lock); - } return 0; } @@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; - cgroup_sk_alloc_disable(); - cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; -- cgit v1.2.3-71-gd317 From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Sep 2021 13:23:22 -0700 Subject: memblock: introduce saner 'memblock_free_ptr()' interface The boot-time allocation interface for memblock is a mess, with 'memblock_alloc()' returning a virtual pointer, but then you are supposed to free it with 'memblock_free()' that takes a _physical_ address. Not only is that all kinds of strange and illogical, but it actually causes bugs, when people then use it like a normal allocation function, and it fails spectacularly on a NULL pointer: https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/ or just random memory corruption if the debug checks don't catch it: https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/ I really don't want to apply patches that treat the symptoms, when the fundamental cause is this horribly confusing interface. 
I started out looking at just automating a sane replacement sequence, but because of this mix of virtual and physical addresses, and because people have used the "__pa()" macro that can take either a regular kernel pointer, or just the raw "unsigned long" address, it's all quite messy. So this just introduces a new saner interface for freeing a virtual address that was allocated using 'memblock_alloc()', and that was kept as a regular kernel pointer. And then it converts a couple of users that are obvious and easy to test, including the 'xbc_nodes' case in lib/bootconfig.c that caused problems. Reported-by: kernel test robot Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Steven Rostedt Cc: Mike Rapoport Cc: Andrew Morton Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Vlastimil Babka Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/kasan_init_64.c | 6 ++---- arch/x86/mm/numa.c | 2 +- arch/x86/mm/numa_emulation.c | 3 +-- drivers/base/arch_numa.c | 2 +- drivers/macintosh/smu.c | 2 +- include/linux/memblock.h | 1 + init/main.c | 2 +- kernel/printk/printk.c | 4 ++-- lib/bootconfig.c | 2 +- mm/memblock.c | 16 +++++++++++++++- 11 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 78a32b956e81..5afd98559193 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free_ptr(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1a50434c8a4d..ef885370719a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,8 +49,7 @@ static
void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PMD_SIZE); + memblock_free_ptr(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -86,8 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - else if (p) - memblock_free(__pa(p), PUD_SIZE); + memblock_free_ptr(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a1b5c71099e6..1e9b93b088db 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 737491b13728..e801e30089c4 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,8 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - if (phys_dist) - memblock_free(__pa(phys_dist), phys_size); + memblock_free_ptr(phys_dist, phys_size); return; no_emu: diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 46c503486e96..00fb4120a5b3 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -264,7 +264,7 @@ void __init numa_free_distance(void) size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - memblock_free(__pa(numa_distance), size); + memblock_free_ptr(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; } diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c 
index 94fb63a7b357..fe63d5ee201b 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -570,7 +570,7 @@ fail_msg_node: fail_db_node: of_node_put(smu->db_node); fail_bootmem: - memblock_free(__pa(smu), sizeof(struct smu_device)); + memblock_free_ptr(smu, sizeof(struct smu_device)); smu = NULL; fail_np: of_node_put(np); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b066024c62e3..34de69b3b8ba 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -118,6 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); +void memblock_free_ptr(void *ptr, size_t size); void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); diff --git a/init/main.c b/init/main.c index 5c9a48df90e1..3f7216934441 100644 --- a/init/main.c +++ b/init/main.c @@ -924,7 +924,7 @@ static void __init print_unknown_bootoptions(void) end += sprintf(end, " %s", *p); pr_notice("Unknown command line parameters:%s\n", unknown_options); - memblock_free(__pa(unknown_options), len); + memblock_free_ptr(unknown_options, len); } asmlinkage __visible void __init __no_sanitize_address start_kernel(void) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/lib/bootconfig.c b/lib/bootconfig.c index f8419cff1147..5ae248b29373 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -792,7 +792,7 @@ void __init xbc_destroy_all(void) xbc_data = NULL; xbc_data_size = 0; 
xbc_node_num = 0; - memblock_free(__pa(xbc_nodes), sizeof(struct xbc_node) * XBC_NODE_MAX); + memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); xbc_nodes = NULL; brace_index = 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 0ab5a749bfa6..184dcd2e5d99 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_alloc_size); + memblock_free_ptr(old_array, old_alloc_size); /* * Reserve the new array if that comes from the memblock. Otherwise, we @@ -795,6 +795,20 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.memory, base, size); } +/** + * memblock_free_ptr - free boot memory allocation + * @ptr: starting address of the boot memory allocation + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init_memblock memblock_free_ptr(void *ptr, size_t size) +{ + if (ptr) + memblock_free(__pa(ptr), size); +} + /** * memblock_free - free boot memory block * @base: phys starting address of the boot memory block -- cgit v1.2.3-71-gd317 From f6b5f1a56987de837f8e25cd560847106b8632a8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 14 Sep 2021 20:52:24 -0700 Subject: compiler.h: Introduce absolute_pointer macro absolute_pointer() disassociates a pointer from its originating symbol type and context. 
Use it to prevent compiler warnings/errors such as drivers/net/ethernet/i825xx/82596.c: In function 'i82596_probe': arch/m68k/include/asm/string.h:72:25: error: '__builtin_memcpy' reading 6 bytes from a region of size 0 [-Werror=stringop-overread] Such warnings may be reported by gcc 11.x for string and memory operations on fixed addresses. Suggested-by: Linus Torvalds Signed-off-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b67261a1e3e9..3d5af56337bd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -188,6 +188,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, (typeof(ptr)) (__ptr + (off)); }) #endif +#define absolute_pointer(val) RELOC_HIDE((void *)(val), 0) + #ifndef OPTIMIZER_HIDE_VAR /* Make the optimizer believe the variable can be manipulated arbitrarily. */ #define OPTIMIZER_HIDE_VAR(var) \ -- cgit v1.2.3-71-gd317