From 5f867db63473f32cce1b868e281ebd42a41f8fad Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 26 Jun 2015 19:43:58 -0500 Subject: mtd: nand: Fix NAND_USE_BOUNCE_BUFFER flag conflict Commit 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base poi databuf as bounce buffer") added a flag NAND_USE_BOUNCE_BUFFER using the same bit value as the existing NAND_BUSWIDTH_AUTO. Cc: Kamal Dasu Fixes: 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base poi databuf as bounce buffer") Signed-off-by: Scott Wood Signed-off-by: Brian Norris --- include/linux/mtd/nand.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index f25e2bdd188c..272f42952f34 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -177,11 +177,6 @@ typedef enum { #define NAND_OWN_BUFFERS 0x00020000 /* Chip may not exist, so silence any errors in scan */ #define NAND_SCAN_SILENT_NODEV 0x00040000 -/* - * This option could be defined by controller drivers to protect against - * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers - */ -#define NAND_USE_BOUNCE_BUFFER 0x00080000 /* * Autodetect nand buswidth with readid/onfi. * This suppose the driver will configure the hardware in 8 bits mode @@ -189,6 +184,11 @@ typedef enum { * before calling nand_scan_tail. */ #define NAND_BUSWIDTH_AUTO 0x00080000 +/* + * This option could be defined by controller drivers to protect against + * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers + */ +#define NAND_USE_BOUNCE_BUFFER 0x00100000 /* Options set by nand scan */ /* Nand scan has allocated controller struct */ -- cgit v1.2.3-71-gd317 From 4c62360d7562a20c996836d163259c87d9378120 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Tue, 30 Jun 2015 15:57:51 -0700 Subject: efi: Handle memory error structures produced based on old versions of standard The memory error record structure includes as its first field a bitmask of which subsequent fields are valid. The allows new fields to be added to the structure while keeping compatibility with older software that parses these records. This mechanism was used between versions 2.2 and 2.3 to add four new fields, growing the size of the structure from 73 bytes to 80. But Linux just added all the new fields so this test: if (gdata->error_data_length >= sizeof(*mem_err)) cper_print_mem(newpfx, mem_err); else goto err_section_too_small; now make Linux complain about old format records being too short. Add a definition for the old format of the structure and use that for the minimum size check. Pass the actual size to cper_print_mem() so it can sanity check the validation_bits field to ensure that if a BIOS using the old format sets bits as if it were new, we won't access fields beyond the end of the structure. Signed-off-by: Tony Luck Cc: Signed-off-by: Matt Fleming --- drivers/firmware/efi/cper.c | 15 ++++++++++++--- include/linux/cper.h | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 4fd9961d552e..d42537425438 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -305,10 +305,17 @@ const char *cper_mem_err_unpack(struct trace_seq *p, return ret; } -static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) +static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem, + int len) { struct cper_mem_err_compact cmem; + /* Don't trust UEFI 2.1/2.2 structure with bad validation bits */ + if (len == sizeof(struct cper_sec_mem_err_old) && + (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) { + pr_err(FW_WARN "valid bits set for fields beyond structure\n"); + return; + } if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); if (mem->validation_bits & CPER_MEM_VALID_PA) @@ -405,8 +412,10 @@ static void cper_estatus_print_section( } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = (void *)(gdata + 1); printk("%s""section_type: memory error\n", newpfx); - if (gdata->error_data_length >= sizeof(*mem_err)) - cper_print_mem(newpfx, mem_err); + if (gdata->error_data_length >= + sizeof(struct cper_sec_mem_err_old)) + cper_print_mem(newpfx, mem_err, + gdata->error_data_length); else goto err_section_too_small; } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) { diff --git a/include/linux/cper.h b/include/linux/cper.h index 76abba4b238e..dcacb1a72e26 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -340,7 +340,27 @@ struct cper_ia_proc_ctx { __u64 mm_reg_addr; }; -/* Memory Error Section */ +/* Old Memory Error Section UEFI 2.1, 2.2 */ +struct cper_sec_mem_err_old { + __u64 validation_bits; + __u64 error_status; + __u64 physical_addr; + __u64 physical_addr_mask; + __u16 node; + __u16 card; + __u16 module; + __u16 bank; + __u16 device; + __u16 row; + __u16 column; + __u16 bit_pos; + __u64 requestor_id; + __u64 responder_id; + __u64 target_id; + __u8 error_type; +}; + +/* Memory Error Section UEFI >= 2.3 */ struct cper_sec_mem_err { __u64 validation_bits; __u64 error_status; -- cgit v1.2.3-71-gd317 From 71d126fd28de2d4d9b7b2088dbccd7ca62fad6e0 Mon Sep 17 00:00:00 2001 From: Arne Fitzenreiter Date: Wed, 15 Jul 2015 13:54:36 +0200 Subject: libata: add ATA_HORKAGE_NOTRIM Some devices lose data on TRIM whether queued or not. This patch adds a horkage to disable TRIM. tj: Collapsed unnecessary if() nesting. Signed-off-by: Arne Fitzenreiter Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- drivers/ata/libata-scsi.c | 3 ++- drivers/ata/libata-transport.c | 2 ++ include/linux/libata.h | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 3131adcc1f87..641a61a59e89 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2568,7 +2568,8 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf) rbuf[14] = (lowest_aligned >> 8) & 0x3f; rbuf[15] = lowest_aligned; - if (ata_id_has_trim(args->id)) { + if (ata_id_has_trim(args->id) && + !(dev->horkage & ATA_HORKAGE_NOTRIM)) { rbuf[14] |= 0x80; /* LBPME */ if (ata_id_has_zero_after_trim(args->id) && diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index d6c37bcd416d..e2d94972962d 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -569,6 +569,8 @@ show_ata_dev_trim(struct device *dev, if (!ata_id_has_trim(ata_dev->id)) mode = "unsupported"; + else if (ata_dev->horkage & ATA_HORKAGE_NOTRIM) + mode = "forced_unsupported"; else if (ata_dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM) mode = "forced_unqueued"; else if (ata_fpdma_dsm_supported(ata_dev)) diff --git a/include/linux/libata.h b/include/linux/libata.h index 36ce37bcc963..5c8bac6225a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -431,6 +431,8 @@ enum { ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ + ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ + /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3-71-gd317 From af34d637637eabaf49406eb35c948cd51ba262a6 Mon Sep 17 00:00:00 2001 From: David Milburn Date: Mon, 13 Jul 2015 11:48:23 -0500 Subject: libata: add ATA_HORKAGE_MAX_SEC_1024 to revert back to previous max_sectors limit Since no longer limiting max_sectors to BLK_DEF_MAX_SECTORS (commit 34b48db66e08), data corruption may occur on ST380013AS drive configured on 82801JI (ICH10 Family) SATA controller. This patch will allow the driver to limit max_sectors as before # cat /sys/block/sdb/queue/max_sectors_kb 512 I was able to double the max_sectors_kb value up to 16384 on linux-4.2.0-rc2 before seeing corruption, but seems safer to use previous limit. Without this patch max_sectors_kb will be 32767. tj: Minor comment update. Reported-by: Jeff Moyer Signed-off-by: David Milburn Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v3.19 and later Fixes: 34b48db66e08 ("block: remove artifical max_hw_sectors cap") --- drivers/ata/libata-core.c | 10 ++++++++++ include/linux/ata.h | 1 + include/linux/libata.h | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ed2b218ea64d..68202a8a3a0b 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2478,6 +2478,10 @@ int ata_dev_configure(struct ata_device *dev) dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128, dev->max_sectors); + if (dev->horkage & ATA_HORKAGE_MAX_SEC_1024) + dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024, + dev->max_sectors); + if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48) dev->max_sectors = ATA_MAX_SECTORS_LBA48; @@ -4146,6 +4150,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "Slimtype DVD A DS8A8SH", NULL, ATA_HORKAGE_MAX_SEC_LBA48 }, { "Slimtype DVD A DS8A9SH", NULL, ATA_HORKAGE_MAX_SEC_LBA48 }, + /* + * Causes silent data corruption with higher max sects. + * http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com + */ + { "ST380013AS", "3.20", ATA_HORKAGE_MAX_SEC_1024 }, + /* Devices we expect to fail diagnostics */ /* Devices where NCQ should be avoided */ diff --git a/include/linux/ata.h b/include/linux/ata.h index fed36418dd1c..6c78956aa470 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -45,6 +45,7 @@ enum { ATA_SECT_SIZE = 512, ATA_MAX_SECTORS_128 = 128, ATA_MAX_SECTORS = 256, + ATA_MAX_SECTORS_1024 = 1024, ATA_MAX_SECTORS_LBA48 = 65535,/* TODO: 65536? */ ATA_MAX_SECTORS_TAPE = 65535, diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c8bac6225a6..c9cfbcdb8d14 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -432,7 +432,7 @@ enum { ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ - + ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3-71-gd317 From cd812599796f500b042f5464b6665755eca21137 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Jul 2015 11:12:07 -0400 Subject: NFS: Remove the "NFS_CAP_CHANGE_ATTR" capability Setting the change attribute has been mandatory for all NFS versions, since commit 3a1556e8662c ("NFSv2/v3: Simulate the change attribute"). We should therefore not have anything be conditional on it being set/unset. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- fs/nfs/inode.c | 4 ++-- fs/nfs/nfs4proc.c | 3 --- include/linux/nfs_fs_sb.h | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ecebb406cc1a..4a90c9bb3135 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 426e4f8207ef..0adc7d245b3d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -442,7 +442,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) @@ -1692,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9264994ec9d3..c85ffe67b5f3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8591,7 +8591,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8617,7 +8616,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8640,7 +8638,6 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a2ea1491d3df..20bc8e51b161 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -220,7 +220,7 @@ struct nfs_server { #define NFS_CAP_SYMLINKS (1U << 2) #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) -#define NFS_CAP_CHANGE_ATTR (1U << 5) +/* #define NFS_CAP_CHANGE_ATTR (1U << 5) */ #define NFS_CAP_FILEID (1U << 6) #define NFS_CAP_MODE (1U << 7) #define NFS_CAP_NLINK (1U << 8) -- cgit v1.2.3-71-gd317 From 115c48d7a5351abeadd0c8a3dc87eca3d66a6475 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Jul 2015 12:36:34 -0400 Subject: NFS: nfs_mark_for_revalidate should always set NFS_INO_REVAL_PAGECACHE I'm not aware of any existing bugs around this, but the expectation is that nfs_mark_for_revalidate() should always force a revalidation of the cached metadata. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f91b5ade30c9..874b77228fb9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -292,9 +292,12 @@ static inline void nfs_mark_for_revalidate(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR | + NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); } -- cgit v1.2.3-71-gd317 From a3bd4f989f532694337dd30538b635d5213ab86a Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Wed, 22 Jul 2015 20:53:09 +0800 Subject: mmc: sdhci-esdhc-imx: clear f_max in boarddata After commit 8d86e4fcccf6 ("mmc: sdhci-esdhc-imx: Call mmc_of_parse()"), it's not used anymore. Signed-off-by: Dong Aisheng Reviewed-by: Johan Derycke Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 7 +------ include/linux/platform_data/mmc-esdhc-imx.h | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 1b0e61847e73..c6b9f6492e1a 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -581,13 +581,8 @@ static void esdhc_writeb_le(struct sdhci_host *host, u8 val, int reg) static unsigned int esdhc_pltfm_get_max_clock(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct pltfm_imx_data *imx_data = pltfm_host->priv; - struct esdhc_platform_data *boarddata = &imx_data->boarddata; - if (boarddata->f_max && (boarddata->f_max < pltfm_host->clock)) - return boarddata->f_max; - else - return pltfm_host->clock; + return pltfm_host->clock; } static unsigned int esdhc_pltfm_get_min_clock(struct sdhci_host *host) diff --git a/include/linux/platform_data/mmc-esdhc-imx.h b/include/linux/platform_data/mmc-esdhc-imx.h index 75f70f6ac137..e1571efa3f2b 100644 --- a/include/linux/platform_data/mmc-esdhc-imx.h +++ b/include/linux/platform_data/mmc-esdhc-imx.h @@ -43,7 +43,6 @@ struct esdhc_platform_data { enum wp_types wp_type; enum cd_types cd_type; int max_bus_width; - unsigned int f_max; bool support_vsel; unsigned int delay_line; }; -- cgit v1.2.3-71-gd317 From e3eea1404f5ff7a2ceb7b5e7ba412a6fd94f2935 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Jul 2015 10:38:12 -0400 Subject: ftrace: Fix breakage of set_ftrace_pid Commit 4104d326b670 ("ftrace: Remove global function list and call function directly") simplified the ftrace code by removing the global_ops list with a new design. But this cleanup also broke the filtering of PIDs that are added to the set_ftrace_pid file. Add back the proper hooks to have pid filtering working once again. Cc: stable@vger.kernel.org # 3.16+ Reported-by: Matt Fleming Reported-by: Richard Weinberger Tested-by: Matt Fleming Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 52 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 37 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1da602982cf9..6cd8c0ee4b6f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -116,6 +116,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * SAVE_REGS. If another ops with this flag set is already registered * for any of the functions that this ops will be registered for, then * this ops will fail to register or set_filter_ip. + * PID - Is affected by set_ftrace_pid (allows filtering on those pids) */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -132,6 +133,7 @@ enum { FTRACE_OPS_FL_MODIFYING = 1 << 11, FTRACE_OPS_FL_ALLOC_TRAMP = 1 << 12, FTRACE_OPS_FL_IPMODIFY = 1 << 13, + FTRACE_OPS_FL_PID = 1 << 14, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -159,6 +161,7 @@ struct ftrace_ops { struct ftrace_ops *next; unsigned long flags; void *private; + ftrace_func_t saved_func; int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE int nr_trampolines; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..eb11011b5292 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -98,6 +98,13 @@ struct ftrace_pid { struct pid *pid; }; +static bool ftrace_pids_enabled(void) +{ + return !list_empty(&ftrace_pids); +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops); + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; @@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip, op, regs); -} - -static void set_ftrace_pid_function(ftrace_func_t func) -{ - /* do not set ftrace_pid_function to itself! */ - if (func != ftrace_pid_func) - ftrace_pid_function = func; + op->saved_func(ip, parent_ip, op, regs); } /** @@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - ftrace_pid_function = ftrace_stub; } static void control_ops_disable_all(struct ftrace_ops *ops) @@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) } else add_ftrace_ops(&ftrace_ops_list, ops); + /* Always save the function, and reset at unregistering */ + ops->saved_func = ops->func; + + if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + ops->func = ftrace_pid_func; + ftrace_update_trampoline(ops); if (ftrace_enabled) @@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); + ops->func = ops->saved_func; + return 0; } static void ftrace_update_pid_func(void) { + bool enabled = ftrace_pids_enabled(); + struct ftrace_ops *op; + /* Only do something if we are tracing something */ if (ftrace_trace_function == ftrace_stub) return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PID) { + op->func = enabled ? ftrace_pid_func : + op->saved_func; + ftrace_update_trampoline(op); + } + } while_for_each_ftrace_op(op); + update_ftrace_function(); } @@ -1133,7 +1150,8 @@ static struct ftrace_ops global_ops = { .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED, + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, }; /* @@ -5023,7 +5041,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) static struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, }; static int __init ftrace_nodyn_init(void) @@ -5080,11 +5100,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) if (WARN_ON(tr->ops->func != ftrace_stub)) printk("ftrace ops had %pS for function\n", tr->ops->func); - /* Only the top level instance does pid tracing */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } } tr->ops->func = func; tr->ops->private = tr; @@ -5371,7 +5386,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos) { mutex_lock(&ftrace_lock); - if (list_empty(&ftrace_pids) && (!*pos)) + if (!ftrace_pids_enabled() && (!*pos)) return (void *) 1; return seq_list_start(&ftrace_pids, *pos); @@ -5610,6 +5625,7 @@ static struct ftrace_ops graph_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, -- cgit v1.2.3-71-gd317 From e018a0cce3d849bc73e72686c571420adc40bad2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jul 2015 21:24:04 +0300 Subject: net/macb: convert to kernel doc This patch coverts struct description to the kernel doc format. There is no functional change. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/platform_data/macb.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h index 044a124bfbbc..21b15f6fee25 100644 --- a/include/linux/platform_data/macb.h +++ b/include/linux/platform_data/macb.h @@ -8,11 +8,19 @@ #ifndef __MACB_PDATA_H__ #define __MACB_PDATA_H__ +/** + * struct macb_platform_data - platform data for MACB Ethernet + * @phy_mask: phy mask passed when register the MDIO bus + * within the driver + * @phy_irq_pin: PHY IRQ + * @is_rmii: using RMII interface? + * @rev_eth_addr: reverse Ethernet address byte order + */ struct macb_platform_data { u32 phy_mask; - int phy_irq_pin; /* PHY IRQ */ - u8 is_rmii; /* using RMII interface? */ - u8 rev_eth_addr; /* reverse Ethernet address byte order */ + int phy_irq_pin; + u8 is_rmii; + u8 rev_eth_addr; }; #endif /* __MACB_PDATA_H__ */ -- cgit v1.2.3-71-gd317 From 35068ce8cbf1749ef1a4b9b1493af83b8488c37b Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 1 Jul 2015 09:10:43 +0200 Subject: of: constify drv arg of of_driver_match_device stub With this change the stub has the same signature as the actual function, preventing this compiler warning when building without CONFIG_OF: drivers/base/property.c: In function 'fwnode_driver_match_device': >> drivers/base/property.c:608:38: warning: passing argument 2 of 'of_driver_match_device' discards 'const' qualifier from pointer target type return of_driver_match_device(dev, drv); ^ In file included from drivers/base/property.c:18:0: include/linux/of_device.h:61:19: note: expected 'struct device_driver *' but argument is of type 'const struct device_driver *' static inline int of_driver_match_device(struct device *dev, ^ Signed-off-by: Tomeu Vizoso Signed-off-by: Rob Herring --- include/linux/of_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 4c508549833a..cc7dd687a89d 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -59,7 +59,7 @@ void of_dma_configure(struct device *dev, struct device_node *np); #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, - struct device_driver *drv) + const struct device_driver *drv) { return 0; } -- cgit v1.2.3-71-gd317 From 559ed40752dc63e68f9b9ad301b20e6a3fe5cf21 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 26 Jul 2015 02:07:47 +0200 Subject: cpufreq: Avoid attempts to create duplicate symbolic links After commit 87549141d516 (cpufreq: Stop migrating sysfs files on hotplug) there is a problem with CPUs that share cpufreq policy objects with other CPUs and are initially offline. Say CPU1 shares a policy with CPU0 which is online and is registered first. As part of the registration process, cpufreq_add_dev() is called for it. It creates the policy object and a symbolic link to it from the CPU1's sysfs directory. If CPU1 is registered subsequently and it is offline at that time, cpufreq_add_dev() will attempt to create a symbolic link to the policy object for it, but that link is present already, so a warning about that will be triggered. To avoid that warning, make cpufreq use an additional CPU mask containing related CPUs that are actually present for each policy object. That mask is initialized when the policy object is populated after its creation (for the first online CPU using it) and it includes CPUs from the "policy CPUs" mask returned by the cpufreq driver's ->init() callback that are physically present at that time. Symbolic links to the policy are created only for the CPUs in that mask. If cpufreq_add_dev() is invoked for an offline CPU, it checks the new mask and only creates the symlink if the CPU was not in it (the CPU is added to the mask at the same time). In turn, cpufreq_remove_dev() drops the given CPU from the new mask, removes its symlink to the policy object and returns, unless it is the CPU owning the policy object. In that case, the policy object is moved to a new CPU's sysfs directory or deleted if the CPU being removed was the last user of the policy. While at it, notice that cpufreq_remove_dev() can't fail, because its return value is ignored, so make it ignore return values from __cpufreq_remove_dev_prepare() and __cpufreq_remove_dev_finish() and prevent these functions from aborting on errors returned by __cpufreq_governor(). Also drop the now unused sif argument from them. Fixes: 87549141d516 (cpufreq: Stop migrating sysfs files on hotplug) Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: Russell King Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 108 +++++++++++++++++++++++----------------------- include/linux/cpufreq.h | 1 + 2 files changed, 56 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 26063afb3eba..7a3c30c4336f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1002,7 +1002,7 @@ static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy) int ret = 0; /* Some related CPUs might not be present (physically hotplugged) */ - for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) { + for_each_cpu(j, policy->real_cpus) { if (j == policy->kobj_cpu) continue; @@ -1019,7 +1019,7 @@ static void cpufreq_remove_dev_symlink(struct cpufreq_policy *policy) unsigned int j; /* Some related CPUs might not be present (physically hotplugged) */ - for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) { + for_each_cpu(j, policy->real_cpus) { if (j == policy->kobj_cpu) continue; @@ -1163,11 +1163,14 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev) if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) goto err_free_cpumask; + if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL)) + goto err_free_rcpumask; + ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &dev->kobj, "cpufreq"); if (ret) { pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret); - goto err_free_rcpumask; + goto err_free_real_cpus; } INIT_LIST_HEAD(&policy->policy_list); @@ -1184,6 +1187,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev) return policy; +err_free_real_cpus: + free_cpumask_var(policy->real_cpus); err_free_rcpumask: free_cpumask_var(policy->related_cpus); err_free_cpumask: @@ -1234,6 +1239,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy, bool notify) write_unlock_irqrestore(&cpufreq_driver_lock, flags); cpufreq_policy_put_kobj(policy, notify); + free_cpumask_var(policy->real_cpus); free_cpumask_var(policy->related_cpus); free_cpumask_var(policy->cpus); kfree(policy); @@ -1258,14 +1264,17 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) pr_debug("adding CPU %u\n", cpu); - /* - * Only possible if 'cpu' wasn't physically present earlier and we are - * here from subsys_interface add callback. A hotplug notifier will - * follow and we will handle it like logical CPU hotplug then. For now, - * just create the sysfs link. - */ - if (cpu_is_offline(cpu)) - return add_cpu_dev_symlink(per_cpu(cpufreq_cpu_data, cpu), cpu); + if (cpu_is_offline(cpu)) { + /* + * Only possible if we are here from the subsys_interface add + * callback. A hotplug notifier will follow and we will handle + * it as CPU online then. For now, just create the sysfs link, + * unless there is no policy or the link is already present. + */ + policy = per_cpu(cpufreq_cpu_data, cpu); + return policy && !cpumask_test_and_set_cpu(cpu, policy->real_cpus) + ? add_cpu_dev_symlink(policy, cpu) : 0; + } if (!down_read_trylock(&cpufreq_rwsem)) return 0; @@ -1307,6 +1316,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) /* related cpus should atleast have policy->cpus */ cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus); + /* Remember which CPUs have been present at the policy creation time. */ + if (!recover_policy) + cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask); + /* * affected cpus must always be the one, which are online. We aren't * managing offline cpus here. @@ -1420,8 +1433,7 @@ nomem_out: return ret; } -static int __cpufreq_remove_dev_prepare(struct device *dev, - struct subsys_interface *sif) +static int __cpufreq_remove_dev_prepare(struct device *dev) { unsigned int cpu = dev->id; int ret = 0; @@ -1437,10 +1449,8 @@ static int __cpufreq_remove_dev_prepare(struct device *dev, if (has_target()) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); - if (ret) { + if (ret) pr_err("%s: Failed to stop governor\n", __func__); - return ret; - } } down_write(&policy->rwsem); @@ -1473,8 +1483,7 @@ static int __cpufreq_remove_dev_prepare(struct device *dev, return ret; } -static int __cpufreq_remove_dev_finish(struct device *dev, - struct subsys_interface *sif) +static int __cpufreq_remove_dev_finish(struct device *dev) { unsigned int cpu = dev->id; int ret; @@ -1492,10 +1501,8 @@ static int __cpufreq_remove_dev_finish(struct device *dev, /* If cpu is last user of policy, free policy */ if (has_target()) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); - if (ret) { + if (ret) pr_err("%s: Failed to exit governor\n", __func__); - return ret; - } } /* @@ -1506,10 +1513,6 @@ static int __cpufreq_remove_dev_finish(struct device *dev, if (cpufreq_driver->exit) cpufreq_driver->exit(policy); - /* Free the policy only if the driver is getting removed. */ - if (sif) - cpufreq_policy_free(policy, true); - return 0; } @@ -1521,42 +1524,41 @@ static int __cpufreq_remove_dev_finish(struct device *dev, static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) { unsigned int cpu = dev->id; - int ret; - - /* - * Only possible if 'cpu' is getting physically removed now. A hotplug - * notifier should have already been called and we just need to remove - * link or free policy here. - */ - if (cpu_is_offline(cpu)) { - struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); - struct cpumask mask; + struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu); - if (!policy) - return 0; + if (!policy) + return 0; - cpumask_copy(&mask, policy->related_cpus); - cpumask_clear_cpu(cpu, &mask); + if (cpu_online(cpu)) { + __cpufreq_remove_dev_prepare(dev); + __cpufreq_remove_dev_finish(dev); + } - /* - * Free policy only if all policy->related_cpus are removed - * physically. - */ - if (cpumask_intersects(&mask, cpu_present_mask)) { - remove_cpu_dev_symlink(policy, cpu); - return 0; - } + cpumask_clear_cpu(cpu, policy->real_cpus); + if (cpumask_empty(policy->real_cpus)) { cpufreq_policy_free(policy, true); return 0; } - ret = __cpufreq_remove_dev_prepare(dev, sif); + if (cpu != policy->kobj_cpu) { + remove_cpu_dev_symlink(policy, cpu); + } else { + /* + * The CPU owning the policy object is going away. Move it to + * another suitable CPU. + */ + unsigned int new_cpu = cpumask_first(policy->real_cpus); + struct device *new_dev = get_cpu_device(new_cpu); + + dev_dbg(dev, "%s: Moving policy object to CPU%u\n", __func__, new_cpu); - if (!ret) - ret = __cpufreq_remove_dev_finish(dev, sif); + sysfs_remove_link(&new_dev->kobj, "cpufreq"); + policy->kobj_cpu = new_cpu; + WARN_ON(kobject_move(&policy->kobj, &new_dev->kobj)); + } - return ret; + return 0; } static void handle_update(struct work_struct *work) @@ -2395,11 +2397,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_PREPARE: - __cpufreq_remove_dev_prepare(dev, NULL); + __cpufreq_remove_dev_prepare(dev); break; case CPU_POST_DEAD: - __cpufreq_remove_dev_finish(dev, NULL); + __cpufreq_remove_dev_finish(dev); break; case CPU_DOWN_FAILED: diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 29ad97c34fd5..bde1e567b3a9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -62,6 +62,7 @@ struct cpufreq_policy { /* CPUs sharing clock, require sw coordination */ cpumask_var_t cpus; /* Online CPUs only */ cpumask_var_t related_cpus; /* Online + Offline CPUs */ + cpumask_var_t real_cpus; /* Related and present */ unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ -- cgit v1.2.3-71-gd317 From 4248b0da460839e30eaaad78992b9a1dd3e63e21 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 6 Aug 2015 15:46:20 -0700 Subject: fs, file table: reinit files_stat.max_files after deferred memory initialisation Dave Hansen reported the following; My laptop has been behaving strangely with 4.2-rc2. Once I log in to my X session, I start getting all kinds of strange errors from applications and see this in my dmesg: VFS: file-max limit 8192 reached The problem is that the file-max is calculated before memory is fully initialised and miscalculates how much memory the kernel is using. This patch recalculates file-max after deferred memory initialisation. Note that using memory hotplug infrastructure would not have avoided this problem as the value is not recalculated after memory hot-add. 4.1: files_stat.max_files = 6582781 4.2-rc2: files_stat.max_files = 8192 4.2-rc2 patched: files_stat.max_files = 6562467 Small differences with the patch applied and 4.1 but not enough to matter. Signed-off-by: Mel Gorman Reported-by: Dave Hansen Cc: Nicolai Stange Cc: Dave Hansen Cc: Alex Ng Cc: Fengguang Wu Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 13 +++---------- fs/file_table.c | 24 +++++++++++++++--------- include/linux/fs.h | 5 +++-- init/main.c | 2 +- mm/page_alloc.c | 3 +++ 5 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 5c8ea15e73a5..9b5fe503f6cb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/file_table.c b/fs/file_table.c index 7f9d407c7595..ad17e05ebf95 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -308,19 +309,24 @@ void put_filp(struct file *file) } } -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/include/linux/fs.h b/include/linux/fs.h index cc008c338f5a..84b783f277f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -55,7 +55,8 @@ struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); -extern void __init files_init(unsigned long); +extern void __init files_init(void); +extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); @@ -2245,7 +2246,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp); /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); -extern void __init vfs_caches_init(unsigned long); +extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; diff --git a/init/main.c b/init/main.c index c5d5626289ce..56506553d4d8 100644 --- a/init/main.c +++ b/init/main.c @@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void) key_init(); security_init(); dbg_late_init(); - vfs_caches_init(totalram_pages); + vfs_caches_init(); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 322628278ae4..cb61f44eb3fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1201,6 +1201,9 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ -- cgit v1.2.3-71-gd317 From f4c18e6f7b5bbb5b528b3334115806b0d76f50f9 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 6 Aug 2015 15:47:08 -0700 Subject: mm: check __PG_HWPOISON separately from PAGE_FLAGS_CHECK_AT_* The race condition addressed in commit add05cecef80 ("mm: soft-offline: don't free target page in successful page migration") was not closed completely, because that can happen not only for soft-offline, but also for hard-offline. Consider that a slab page is about to be freed into buddy pool, and then an uncorrected memory error hits the page just after entering __free_one_page(), then VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP) is triggered, despite the fact that it's not necessary because the data on the affected page is not consumed. To solve it, this patch drops __PG_HWPOISON from page flag checks at allocation/free time. I think it's justified because __PG_HWPOISON flags is defined to prevent the page from being reused, and setting it outside the page's alloc-free cycle is a designed behavior (not a bug.) For recent months, I was annoyed about BUG_ON when soft-offlined page remains on lru cache list for a while, which is avoided by calling put_page() instead of putback_lru_page() in page migration's success path. This means that this patch reverts a major change from commit add05cecef80 about the new refcounting rule of soft-offlined pages, so "reuse window" revives. This will be closed by a subsequent patch. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Dean Nelson Cc: Tony Luck Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +++++++--- mm/huge_memory.c | 7 +------ mm/migrate.c | 5 ++++- mm/page_alloc.c | 4 ++++ 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f34e040b34e9..41c93844fb1d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -631,15 +631,19 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + 1 << PG_unevictable | __PG_MLOCKED | \ __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. - * Pages being prepped should not have any flags set. It they are set, + * Pages being prepped should not have these flags set. It they are set, * there has been a kernel bug or struct page corruption. + * + * __PG_HWPOISON is exceptional because it needs to be kept beyond page's + * alloc-free cycle to prevent from reusing the page. */ -#define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) +#define PAGE_FLAGS_CHECK_AT_PREP \ + (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1 << PG_private | 1 << PG_private_2) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c107094f79ba..097c7a4bfbd9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page, /* after clearing PageTail the gup refcount can be released */ smp_mb__after_atomic(); - /* - * retain hwpoison flag of the poisoned tail page: - * fix for the unsuitable process killed on Guest Machine(KVM) - * by the memory-failure. - */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | diff --git a/mm/migrate.c b/mm/migrate.c index ee401e4e5ef1..f2415be7d93b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -950,7 +950,10 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (reason != MR_MEMORY_FAILURE) + /* Soft-offlined page shouldn't go through lru cache list */ + if (reason == MR_MEMORY_FAILURE) + put_page(page); + else putback_lru_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb61f44eb3fc..beda41710802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1296,6 +1296,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; -- cgit v1.2.3-71-gd317 From 7a76a021cd5a292be875fbc616daf03eab1e6996 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 7 Aug 2015 09:32:21 -0700 Subject: net-timestamp: Update skb_complete_tx_timestamp comment After "62bccb8 net-timestamp: Make the clone operation stand-alone from phy timestamping" the hwtstamps parameter of skb_complete_tx_timestamp() may no longer be NULL. Signed-off-by: Benjamin Poirier Cc: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53..22b6d9ca1654 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2884,11 +2884,11 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers - * must call this function to return the skb back to the stack, with - * or without a timestamp. + * must call this function to return the skb back to the stack with a + * timestamp. * * @skb: clone of the the original outgoing packet - * @hwtstamps: hardware time stamps, may be NULL if not available + * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, -- cgit v1.2.3-71-gd317