From 490194269665d6d4915a4a5774f002885c5a2d8f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:26 -0700 Subject: module: Pass struct load_info into symbol checks Since we're already using values from struct load_info, just pass this pointer in directly and use what's needed as we need it. This allows us to access future fields in struct load_info too. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..ca4509b13400 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1278,12 +1278,13 @@ static u32 resolve_rel_crc(const s32 *crc) return *(u32 *)((void *)crc + *crc); } -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) { + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; @@ -1326,8 +1327,7 @@ bad_version: return 0; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { const s32 *crc; @@ -1343,8 +1343,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, BUG(); } preempt_enable(); - return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc); + return check_version(info, VMLINUX_SYMBOL_STR(module_layout), + mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1358,8 +1358,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } #else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) @@ -1367,8 +1366,7 @@ static inline int check_version(Elf_Shdr *sechdrs, return 1; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { return 1; @@ -1404,7 +1402,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -2971,7 +2969,7 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + if (!check_modstruct_version(info, mod)) return ERR_PTR(-ENOEXEC); return mod; -- cgit v1.2.3-71-gd317 From 3e2e857f9c3a19d55ee0ba7b428b8be5008960bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:27 -0700 Subject: module: Add module name to modinfo Accessing the mod structure (e.g. for mod->name) prior to having completed check_modstruct_version() can result in writing garbage to the error logs if the layout of the mod structure loaded from disk doesn't match the running kernel's mod structure layout. This kind of mismatch will become much more likely if a kernel is built with different randomization seed for the struct layout randomization plugin. 
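To make the hazard concrete, here is a small user-space sketch (the field orders are hypothetical stand-ins, not the real struct module) of what reading through a mismatched layout does:

/* Hypothetical layouts only -- not the real struct module. */
#include <stdio.h>

struct mod_on_disk {		/* order the module was built with */
	long state;
	char name[16];
};

struct mod_in_kernel {		/* order the running kernel expects */
	char name[16];
	long state;
};

int main(void)
{
	struct mod_on_disk disk = { .state = 1, .name = "fancy_driver" };
	/* Reinterpreting the same bytes with the other layout makes
	 * 'name' alias the bytes of 'state', i.e. garbage. */
	struct mod_in_kernel *m = (struct mod_in_kernel *)&disk;

	printf("name looks like: \"%.16s\"\n", m->name);
	return 0;
}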
Instead, add and use a new modinfo string for logging the module name. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++++++------- scripts/mod/modpost.c | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ca4509b13400..3803449ca219 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,6 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { + char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -1318,12 +1319,12 @@ static int check_version(const struct load_info *info, } /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", - mod->name, symname); + info->name, symname); return 0; } @@ -2913,9 +2914,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) info->index.vers = 0; /* Pretend no __versions section! */ else info->index.vers = find_sec(info, "__versions"); + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; } @@ -2955,14 +2962,22 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo name field)"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; + /* + * If we didn't load the .modinfo 'name' field, fall back to + * on-disk struct mod 'name' field. 
+ */ + if (!info->name) + info->name = mod->name; + if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", info->name); return ERR_PTR(-ENOEXEC); } @@ -2990,7 +3005,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { pr_err("%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); + info->name, modmagic, vermagic); return -ENOEXEC; } @@ -3270,7 +3285,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; - if (blacklisted(mod->name)) + if (blacklisted(info->name)) return ERR_PTR(-EPERM); err = check_modinfo(mod, info, flags); diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 30d752a4a6a6..48397feb08fb 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2126,6 +2126,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#include \n"); buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); + buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); buf_printf(b, "\n"); buf_printf(b, "__visible struct module __this_module\n"); buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n"); -- cgit v1.2.3-71-gd317 From 1ba5c08b58a0c21fca222f1bf2fde184aa26103f Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 6 Jun 2017 14:17:39 +0200 Subject: kernel/module.c: suppress warning about unused nowarn variable This patch fixes the following warning: kernel/module.c: In function 'add_usage_links': kernel/module.c:1653:6: warning: variable 'nowarn' set but not used [-Wunused-but-set-variable] [jeyu: folded in first patch since it only swapped the function order so that del_usage_links can be called from add_usage_links] Signed-off-by: Corentin Labbe Signed-off-by: Jessica Yu --- kernel/module.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3803449ca219..f546d574f436 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1666,31 +1666,36 @@ static inline void remove_notes_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static void add_usage_links(struct module *mod) +static void del_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; - int nowarn; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } + list_for_each_entry(use, &mod->target_list, target_list) sysfs_remove_link(use->target->holders_dir, mod->name); mutex_unlock(&module_mutex); #endif } -static void del_usage_links(struct module *mod) +static int add_usage_links(struct module *mod) { + int ret = 0; #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); #endif + return ret; } static int module_add_modinfo_attrs(struct module *mod) @@ -1801,13 +1806,18 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; - add_usage_links(mod); + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + add_sect_attrs(mod, info); add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: -- cgit v1.2.3-71-gd317 From 93437353daeff31bd5b11810daa4d2d509d1a64e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 26 May 2017 14:12:25 -0700 Subject: module: use list_for_each_entry_rcu() on find_module_all() The module list has been using RCU in a lot of other calls for a while now; we just overlooked changing this one over to use RCU. Signed-off-by: Luis R. Rodriguez Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f546d574f436..afc6ede7bcdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -603,7 +603,7 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) -- cgit v1.2.3-71-gd317
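For reference, the usual read side of such an RCU-protected list walk looks like the sketch below (a generic example with a hypothetical item type; find_module_all itself instead asserts that module_mutex is held or preemption is disabled):

#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct item {
	int id;
	struct list_head list;
};

static LIST_HEAD(items);

/* Readers traverse under rcu_read_lock(); updaters pair
 * list_add_rcu()/list_del_rcu() with synchronize_rcu(). */
static bool item_exists(int id)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &items, list) {
		if (it->id == id) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}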
From 165d1cc0074b2f938586274776d029b9bce914c4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 23 Jun 2017 12:19:12 -0700 Subject: kmod: reduce atomic operations on kmod_concurrent and simplify When checking if we want to allow a kmod thread to kick off, we increment, then read to see if we should enable a thread. If we were over the allowed limit, we decrement. Splitting the increment far apart from the decrement means there could be a window where two increments happen, potentially giving a false failure on a thread which should have been allowed. CPU1 CPU2 atomic_inc() atomic_inc() atomic_read() atomic_read() atomic_dec() atomic_dec() In this case a read on CPU1 sees both atomic_inc()s, and we could wrongly deny it a kmod thread. We could try to prevent this with a lock or preemption, but that is overkill. We can fix this by reducing the number of atomic operations. We do this by inverting the logic of the enabler: instead of incrementing kmod_concurrent as we get new kmod users, define the variable kmod_concurrent_max as the max number of currently allowed kmod users, and as we get new kmod users just decrement it if it's still positive. This combines the dec and read in one atomic operation. In this case we no longer get the same false failure: CPU1 CPU2 atomic_dec_if_positive() atomic_dec_if_positive() atomic_inc() atomic_inc() The number of threads is computed at init, and since the current computation of kmod_concurrent includes the thread count, we can avoid setting kmod_concurrent_max later in boot through an init call by simply sticking to 50 as the kmod_concurrent_max. The assumption here is that a system with modules must have at least ~16 MiB of RAM. Suggested-by: Petr Mladek Suggested-by: Dmitry Torokhov Signed-off-by: Luis R.
Rodriguez Reviewed-by: Petr Mladek Signed-off-by: Jessica Yu --- kernel/kmod.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..ff68198fe83b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ #include -extern int max_threads; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -56,6 +54,20 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES +/* + * Assuming: + * + * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + * (u64) THREAD_SIZE * 8UL); + * + * If you need less than 50 threads would mean we're dealing with systems + * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * and this would only be an be an upper limit, after which the OOM killer + * would take effect. Systems like these are very unlikely if modules are + * enabled. + */ +#define MAX_KMOD_CONCURRENT 50 +static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); /* modprobe_path is set via /proc/sys. @@ -127,10 +139,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; int ret; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; /* @@ -154,21 +163,7 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. - * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. --RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { + if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { /* We may be blaming an innocent here, but unlikely */ if (kmod_loop_msg < 5) { printk(KERN_ERR @@ -176,7 +171,6 @@ int __request_module(bool wait, const char *fmt, ...) module_name); kmod_loop_msg++; } - atomic_dec(&kmod_concurrent); return -ENOMEM; } @@ -184,10 +178,12 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_dec(&kmod_concurrent); + atomic_inc(&kmod_concurrent_max); + return ret; } EXPORT_SYMBOL(__request_module); + #endif /* CONFIG_MODULES */ static void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.2.3-71-gd317 From 96b5b19459b3c2aed2872bac42cbe19edfae710f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 28 Jun 2017 18:32:31 -0700 Subject: module: make the modinfo name const This can be accomplished by making blacklisted() also accept const. Signed-off-by: Luis R. 
Rodriguez Acked-by: Kees Cook [jeyu: fix typo] Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index afc6ede7bcdf..d07287707557 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,7 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { - char *name; + const char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -3265,7 +3265,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, /* module_blacklist is a comma-separated list of module names */ static char *module_blacklist; -static bool blacklisted(char *module_name) +static bool blacklisted(const char *module_name) { const char *p; size_t len; -- cgit v1.2.3-71-gd317 From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain function pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch. Signed-off-by: Kees Cook --- arch/x86/include/asm/processor.h | 2 +- fs/mount.h | 4 ++-- fs/namei.c | 2 +- fs/proc/internal.h | 6 +++--- include/linux/binfmts.h | 4 ++-- include/linux/cdev.h | 2 +- include/linux/cred.h | 4 ++-- include/linux/dcache.h | 2 +- include/linux/fs.h | 17 +++++++++-------- include/linux/fs_struct.h | 2 +- include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/key-type.h | 4 ++-- include/linux/kmod.h | 2 +- include/linux/kobject.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/mm_types.h | 4 ++-- include/linux/module.h | 4 ++-- include/linux/mount.h | 2 +- include/linux/msg.h | 2 +- include/linux/path.h | 2 +- include/linux/pid_namespace.h | 2 +- include/linux/proc_ns.h | 2 +- include/linux/sched.h | 2 +- include/linux/sched/signal.h | 2 +- include/linux/sem.h | 2 +- include/linux/shm.h | 2 +- include/linux/sysctl.h | 2 +- include/linux/tty.h | 2 +- include/linux/tty_driver.h | 4 ++-- include/linux/user_namespace.h | 2 +- include/linux/utsname.h | 2 +- include/net/af_unix.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/sock.h | 2 +- kernel/futex.c | 4 ++-- security/keys/internal.h | 2 +- 38 files changed, 57 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..e2335edb9fc5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,7 +129,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -}; +} __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..e406b286fba1 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,7 +16,7 @@ struct mnt_namespace { u64 event; unsigned int mounts; /* # of mounts in the
namespace */ unsigned int pending_mounts; -}; +} __randomize_layout; struct mnt_pcp { int mnt_count; @@ -68,7 +68,7 @@ struct mount { struct hlist_head mnt_pins; struct fs_pin mnt_umount; struct dentry *mnt_ex_mountpoint; -}; +} __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index 6571a5f5112e..1764620ac383 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -524,7 +524,7 @@ struct nameidata { struct inode *link_inode; unsigned root_seq; int dfd; -}; +} __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c5ae09b6c726..07b16318223f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,7 +51,7 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ u8 namelen; char name[]; -}; +} __randomize_layout; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -70,7 +70,7 @@ struct proc_inode { struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; -}; +} __randomize_layout; /* * General functions @@ -279,7 +279,7 @@ struct proc_maps_private { #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif -}; +} __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 05488da3aee9..3ae9013eeaaa 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -46,7 +46,7 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; -}; +} __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) @@ -81,7 +81,7 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ -}; +} __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index 408bc09ce497..cb28eb21e3ca 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -17,7 +17,7 @@ struct cdev { struct list_head list; dev_t dev; unsigned int count; -}; +} __randomize_layout; void cdev_init(struct cdev *, const struct file_operations *); diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..82c8a9e1aabb 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -31,7 +31,7 @@ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; -}; +} __randomize_layout; /** * get_group_info - Get a reference to a group info structure @@ -145,7 +145,7 @@ struct cred { struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. 
*/ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ -}; +} __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d2e38dc6172c..7eb262e13d3c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -113,7 +113,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -}; +} __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..8f28143486c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; -}; +} __randomize_layout; static inline bool is_sync_kiocb(struct kiocb *kiocb) { @@ -392,7 +392,7 @@ struct address_space { gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ -} __attribute__((aligned(sizeof(long)))); +} __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit @@ -435,7 +435,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -}; +} __randomize_layout; /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache @@ -653,7 +653,7 @@ struct inode { #endif void *i_private; /* fs or device private pointer */ -}; +} __randomize_layout; static inline unsigned int i_blocksize(const struct inode *node) { @@ -868,7 +868,8 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ +} __randomize_layout + __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; @@ -1005,7 +1006,7 @@ struct file_lock { int state; /* state of grant or error if -ve */ } afs; } fl_u; -}; +} __randomize_layout; struct file_lock_context { spinlock_t flc_lock; @@ -1404,7 +1405,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ -}; +} __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can @@ -1690,7 +1691,7 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); -}; +} __randomize_layout; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 0efc3e62843a..7a026240cbb1 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -12,7 +12,7 @@ struct fs_struct { int umask; int in_exec; struct path root, pwd; -}; +} __randomize_layout; extern struct kmem_cache *fs_cachep; diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 71fd92d81b26..ea0eb0b5f98c 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -20,6 +20,6 @@ struct kern_ipc_perm { umode_t mode; unsigned long seq; void *security; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/ipc_namespace.h 
b/include/linux/ipc_namespace.h index 848e5796400e..65327ee0936b 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -61,7 +61,7 @@ struct ipc_namespace { struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 8496cf64575c..9520fc3c3b9a 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,7 +45,7 @@ struct key_preparsed_payload { size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time_t expiry; /* Expiry time of key */ -}; +} __randomize_layout; typedef int (*request_key_actor_t)(struct key_construction *key, const char *op, void *aux); @@ -158,7 +158,7 @@ struct key_type { /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ -}; +} __randomize_layout; extern struct key_type key_type_keyring; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4e441e00db5..655082c88fd9 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -64,7 +64,7 @@ struct subprocess_info { int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; -}; +} __randomize_layout; extern int call_usermodehelper(const char *path, char **argv, char **envp, int wait); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..084513350317 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -172,7 +172,7 @@ struct kset { spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; -}; +} __randomize_layout; extern void kset_init(struct kset *kset); extern int __must_check kset_register(struct kset *kset); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 080f34e66017..565163fc9ad4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1876,7 +1876,7 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ -}; +} __randomize_layout; /* * Security module hook list structure. 
@@ -1887,7 +1887,7 @@ struct security_hook_list { struct list_head *head; union security_list_options hook; char *lsm; -}; +} __randomize_layout; /* * Initializing a security_hook_list structure takes diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 45cdb27791a3..ff151814a02d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,7 +342,7 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -}; +} __randomize_layout; struct core_thread { struct task_struct *task; @@ -500,7 +500,7 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; -}; +} __randomize_layout; extern struct mm_struct init_mm; diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..d93111d7def6 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -45,7 +45,7 @@ struct module_kobject { struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; -}; +} __randomize_layout; struct module_attribute { struct attribute attr; @@ -475,7 +475,7 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif -} ____cacheline_aligned; +} ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif diff --git a/include/linux/mount.h b/include/linux/mount.h index 8e0352af06b7..1ce85e6fd95f 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,7 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; -}; +} __randomize_layout; struct file; /* forward dec */ struct path; diff --git a/include/linux/msg.h b/include/linux/msg.h index f3f302f9c197..a001305f5a79 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -29,7 +29,7 @@ struct msg_queue { struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; -}; +} __randomize_layout; /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, diff --git a/include/linux/path.h b/include/linux/path.h index d1372186f431..cde895cc4af4 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -7,7 +7,7 @@ struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; -}; +} __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c2a989dee876..b09136f88cf4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -52,7 +52,7 @@ struct pid_namespace { int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -}; +} __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 58ab28d81fc2..06844b54dfc1 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -21,7 +21,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); -}; +} __randomize_layout; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..f833254fce00 100644 --- 
a/include/linux/sched.h +++ b/include/linux/sched.h @@ -408,7 +408,7 @@ struct sched_rt_entity { /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif -}; +} __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c06d63b3a583..2a0dd40b15db 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -222,7 +222,7 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ -}; +} __randomize_layout; /* * Bits in flags field of signal_struct. diff --git a/include/linux/sem.h b/include/linux/sem.h index 9edec926e9d9..23bcbdfad4a6 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,7 +21,7 @@ struct sem_array { int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ -}; +} __randomize_layout; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/shm.h b/include/linux/shm.h index 04e881829625..0fb7061ec54c 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -22,7 +22,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; struct list_head shm_clist; /* list by creator */ -}; +} __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..9ddeef2c03e2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -117,7 +117,7 @@ struct ctl_table struct ctl_table_poll *poll; void *extra1; void *extra2; -}; +} __randomize_layout; struct ctl_node { struct rb_node node; diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..73f8d0977bb0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -333,7 +333,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; -}; +} __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b742b5e47cc2..00b2213f6a35 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -291,7 +291,7 @@ struct tty_operations { void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif const struct file_operations *proc_fops; -}; +} __randomize_layout; struct tty_driver { int magic; /* magic number for this structure */ @@ -325,7 +325,7 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; -}; +} __randomize_layout; extern struct list_head tty_drivers; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 32354b4b4b2b..b3575ce29148 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -66,7 +66,7 @@ struct user_namespace { #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; -}; +} __randomize_layout; struct ucounts { struct hlist_node node; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 60f0bb83b313..da826ed059cf 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -26,7 +26,7 @@ struct uts_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; -}; +} 
__randomize_layout; extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..64e2a1e24a2c 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_skb_parms { u32 secid; /* Security ID */ #endif u32 consumed; -}; +} __randomize_layout; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e4dd3a214034..a62959d2b3f7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -155,7 +155,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; -}; +} __randomize_layout; struct neigh_ops { int family; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fe80bb48ab1f..a224196d16ac 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -147,7 +147,7 @@ struct net { #endif struct sock *diag_nlsk; atomic_t fnhe_genid; -}; +} __randomize_layout; #include diff --git a/include/net/sock.h b/include/net/sock.h index f33e3d134e0b..d349297db9e9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1113,7 +1113,7 @@ struct proto { atomic_t socks; #endif int (*diag_destroy)(struct sock *sk, int err); -}; +} __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..5616511abf39 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ diff --git a/security/keys/internal.h b/security/keys/internal.h index c0f8682eba69..6494954e9980 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -197,7 +197,7 @@ struct request_key_auth { void *callout_info; size_t callout_len; pid_t pid; -}; +} __randomize_layout; extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, -- cgit v1.2.3-71-gd317 From 1d0c6e593023ac5dafc2ea2b3f23d96f1c1f2fa2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 30 Jun 2017 10:22:14 +0530 Subject: PM / sleep: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 3802 624 32 4458 116a kernel/power/main.o File size After adding 'const': text data bss dec hex filename 3866 560 32 4458 116a kernel/power/main.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -705,7 +705,7 @@ static struct attribute * g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.2.3-71-gd317 From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jul 2017 16:24:49 +0100 Subject: bpf: Implement show_options Implement the show_options superblock op for bpf as part of a bid to get rid of s_options and generic_show_options() to make it easier to implement a context-based mount where the mount options can be passed individually over a file descriptor. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: netdev@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. + */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; -- cgit v1.2.3-71-gd317 From 99c621d704cf1c4eb74c3c42e674edf3df64f92d Mon Sep 17 00:00:00 2001 From: Michael Sartain Date: Wed, 5 Jul 2017 22:07:15 -0600 Subject: tracing: Add saved_tgids file to show cached pid to tgid mappings Export the cached pid / tgid mappings in debugfs tracing saved_tgids file. This allows user apps to translate the pids from a trace to their respective thread group. 
Example saved_tgids file with pid / tgid values separated by ' ': # cat saved_tgids 1048 1048 1047 1047 7 7 1049 1047 1054 1047 1053 1047 Link: http://lkml.kernel.org/r/20170630004023.064965233@goodmis.org Link: http://lkml.kernel.org/r/20170706040713.unwkumbta5menygi@mikesart-cos Reviewed-by: Joel Fernandes Signed-off-by: Michael Sartain Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00e2e4169b1e..f079a8ca1117 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4688,6 +4688,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; } + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -7920,6 +7990,9 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); + trace_eval_init(); trace_create_eval_file(d_tracer); -- cgit v1.2.3-71-gd317
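A user-space consumer could turn the file into a pid-to-tgid lookup like this (a sketch that assumes debugfs is mounted at /sys/kernel/debug and relies only on the two space-separated integers per record shown in the example above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/saved_tgids", "r");
	int pid, tgid;

	if (!f) {
		perror("saved_tgids");
		return 1;
	}
	/* Each record is "<pid> <tgid>"; build whatever mapping you need. */
	while (fscanf(f, "%d %d", &pid, &tgid) == 2)
		printf("pid %d belongs to thread group %d\n", pid, tgid);
	fclose(f);
	return 0;
}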
From eaf260ac04d9b4cf9f458d5c97555bfff2da526e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:21 -0700 Subject: tracing: Treat recording comm for idle task as a success Currently we stop recording comm for non-idle tasks when switching from/to the idle task, since we treat that as a record failure. Fix that by treating the recording of comm for the idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f079a8ca1117..6722d86f2af5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* -- cgit v1.2.3-71-gd317 From bd45d34d25720a820021c8ea45de5cd607eace64 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:22 -0700 Subject: tracing: Treat recording tgid for idle task as a success Currently we stop recording tgid for non-idle tasks when switching from/to the idle task, since we treat that as a record failure. Fix that by treating the recording of tgid for the idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6722d86f2af5..aee11e3a394f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2006,7 +2006,11 @@ int trace_find_tgid(int pid) static int trace_save_tgid(struct task_struct *tsk) { - if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) return 0; tgid_map[tsk->pid] = tsk->tgid; -- cgit v1.2.3-71-gd317 From 29b1a8ad7df4528b862a79e3d5fb0936f4d199c7 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:23 -0700 Subject: tracing: Attempt to record other information even if some fail In recent patches where we record comm and tgid at the same time, we skip continuing to record if any fail. Fix that by trying to record as many things as we can even if some couldn't be recorded. If any information isn't recorded, then we don't set trace_taskinfo_save as before. Link: http://lkml.kernel.org/r/20170706230023.17942-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aee11e3a394f..92af8fd1429b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags) */ void tracing_record_taskinfo(struct task_struct *task, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) - return; - if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others.
+ */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -2058,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && - (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) - return; + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - if ((flags & TRACE_RECORD_TGID) && - (!trace_save_tgid(prev) || !trace_save_tgid(next))) + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); -- cgit v1.2.3-71-gd317 From fca18a47cf3eb8425ec19c2dfc374f3d04f5219f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Sat, 8 Jul 2017 00:27:30 +0530 Subject: trace/kprobes: Sanitize derived event names When we derive event names, convert some expected symbols (such as ':' used to specify module:name and '.' present in some symbols) into underscores so that the event name is not rejected. Before this patch: # echo 'p kobject_example:foo_store' > kprobe_events trace_kprobe: Failed to allocate trace_probe.(-22) -sh: write error: Invalid argument After this patch: # echo 'p kobject_example:foo_store' > kprobe_events # cat kprobe_events p:kprobes/p_kobject_example_foo_store_0 kobject_example:foo_store Link: http://lkml.kernel.org/r/66c189e09e71361aba91dd4a5bd146a1b62a7a51.1499453040.git.naveen.n.rao@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..44fd819aa33d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* @@ -740,6 +748,7 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, -- cgit v1.2.3-71-gd317
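The transformation is easy to try in isolation; a stand-alone sketch using the same loop as the helper added above (note the loop never inspects the first character, which is harmless for the generated "p_..."/"r_..." names):

#include <stdio.h>

/* Same loop as the kernel helper: ':' and '.' become '_'. */
static void sanitize_event_name(char *name)
{
	while (*name++ != '\0')
		if (*name == ':' || *name == '.')
			*name = '_';
}

int main(void)
{
	char buf[] = "p_kobject_example:foo_store_0";

	sanitize_event_name(buf);
	printf("%s\n", buf);	/* p_kobject_example_foo_store_0 */
	return 0;
}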
From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:02 -0700 Subject: mm: make PR_SET_THP_DISABLE immediately active PR_SET_THP_DISABLE has a rather subtle semantic. It doesn't affect any existing mapping because it only updates mm->def_flags, which is a template for new mappings. The mappings created after prctl(PR_SET_THP_DISABLE) have the VM_NOHUGEPAGE flag set. This can be quite surprising for all those applications which do not do prctl(); fork() & exec() and want to control their own THP behavior. Another use case where the immediate semantic of the prctl might be useful is a combination of pre- and post-copy migration of containers with CRIU. In this case CRIU populates a part of a memory region with data that was saved during the pre-copy stage. Afterwards, the region is registered with userfaultfd and CRIU expects to get page faults for the parts of the region that were not yet populated. However, khugepaged collapses the pages and the expected page faults do not occur. In the more general case, prctl(PR_SET_THP_DISABLE) could be used as a temporary mechanism for enabling/disabling THP process wide. Implementation-wise, a new MMF_DISABLE_THP flag is added. This flag is tested when the decision whether to use huge pages is taken, either during page fault or at the time of THP collapse. It should be noted that the new implementation makes PR_SET_THP_DISABLE a master override of any per-VMA setting, which was not the case previously. Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE") Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: "Kirill A. Shutemov" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 3 ++- include/linux/sched/coredump.h | 5 ++++- kernel/sys.c | 6 +++--- mm/khugepaged.c | 3 ++- mm/shmem.c | 8 +++++--- 6 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d3b3e8fcc717..40d7b7dd2653 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); (1<vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ !is_vma_temporary_stack(__vma)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 5d9a400af509..f0d7335336cd 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE)) + !(vm_flags & VM_NOHUGEPAGE) && + !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 69eedcef8f03..98ae0d05aa32 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..73fc0af147d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/shmem.c b/mm/shmem.c index 9418f5a9bc46..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); -- cgit v1.2.3-71-gd317
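To see the difference from user space, a minimal sketch (arbitrary sizes; the comments describe the old def_flags-based behavior that this patch replaces):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

int main(void)
{
	/* Created before the prctl: under the old semantic def_flags did
	 * not yet contain VM_NOHUGEPAGE, so khugepaged could still
	 * collapse this region. With MMF_DISABLE_THP it is covered too. */
	char *early = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);

	/* Under the old semantic, only mappings created after the prctl
	 * inherited VM_NOHUGEPAGE from mm->def_flags. */
	char *late = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	printf("early=%p late=%p\n", (void *)early, (void *)late);
	return 0;
}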
From 9dcdcea11491f6eee65bd1b352293ca01e4b7997 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 10 Jul 2017 15:51:14 -0700 Subject: kernel/ksysfs.c: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const.
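The pattern being constified is roughly the following (a generic sketch, not the ksysfs code itself; the foo attribute is hypothetical):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "42\n");	/* hypothetical value */
}
static struct kobj_attribute foo_attr = __ATTR_RO(foo);

static struct attribute *example_attrs[] = {
	&foo_attr.attr,
	NULL,
};

/* Without const the group lives in .data; with const it moves to
 * .rodata, which is what the size comparison below shows. */
static const struct attribute_group example_group = {
	.attrs = example_attrs,
};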
File size before: text data bss dec hex filename 1120 544 16 1680 690 kernel/ksysfs.o File size After adding 'const': text data bss dec hex filename 1160 480 16 1656 678 kernel/ksysfs.o Link: http://lkml.kernel.org/r/aa224b3cc923fdbb3edd0c41b2c639c85408c9e8.1498737347.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Kees Cook Cc: Russell King Cc: Dave Young Cc: Hari Bathini Cc: Petr Tesarik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..df1a9aa602a0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { NULL }; -static struct attribute_group kernel_attr_group = { +static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; -- cgit v1.2.3-71-gd317 From b7b2562f7252878e18de60c24f320052076f9de8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 10 Jul 2017 15:51:17 -0700 Subject: kernel/groups.c: use sort library function setgroups is not exactly a hot path, so we might as well use the library function instead of open-coding the sorting. Saves ~150 bytes. Link: http://lkml.kernel.org/r/1497301378-22739-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple Shell sort */ +static int gid_cmp(const void *_a, const void *_b) +{ + kgid_t a = *(kgid_t *)_a; + kgid_t b = *(kgid_t *)_b; + + return gid_gt(a, b) - gid_lt(a, b); +} + static void groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - kgid_t tmp = group_info->gid[right]; - - while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { - group_info->gid[right] = group_info->gid[left]; - right = left; - left -= stride; - } - group_info->gid[right] = tmp; - } - stride /= 3; - } + sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), + gid_cmp, NULL); } /* a simple bsearch */ -- cgit v1.2.3-71-gd317 From 63b23e2cbc8e80de3e40184ecb2c3bfb705776fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jul 2017 15:51:20 -0700 Subject: kernel/kallsyms.c: replace all_var with IS_ENABLED(CONFIG_KALLSYMS_ALL) 'all_var' looks like a variable, but is actually a macro. Use IS_ENABLED(CONFIG_KALLSYMS_ALL) for clarification. Link: http://lkml.kernel.org/r/1497577591-3434-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -28,12 +28,6 @@ #include -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - /* * These will be re-linked against their real values * during the second link stage. @@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) static int is_ksym_addr(unsigned long addr) { - if (all_var) + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; - else if (all_var) + else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; -- cgit v1.2.3-71-gd317 From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/module.c | 3 ++- arch/mips/kernel/traps.c | 3 ++- arch/sh/mm/extable_64.c | 34 ++++++++++++++++++---------------- arch/sparc/mm/extable.c | 28 ++++++++++++++-------------- include/linux/extable.h | 5 +++-- kernel/extable.c | 3 ++- kernel/module.c | 2 +- lib/extable.c | 41 +++++++++++++++++++++-------------------- 8 files changed, 63 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 94627a3a6a0d..50c020c47e54 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr) spin_lock_irqsave(&dbe_lock, flags); list_for_each_entry(dbe, &dbe_list, dbe_list) { - e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr); + e = search_extable(dbe->dbe_start, + dbe->dbe_end - dbe->dbe_start, addr); if (e) break; } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 38dfa27730ff..b68b4d0726d3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr); + e = search_extable(__start___dbe_table, + __stop___dbe_table - __start___dbe_table, addr); if (!e) e = search_module_dbetables(addr); return e; diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index b90cdfad2c78..7a3b4d33d2e7 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -10,6 +10,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. 
*/ +#include <linux/bsearch.h> #include #include #include @@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long return NULL; } +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > _elt->insn) + return 1; + if (_key < _elt->insn) + return -1; + return 0; +} + /* Simple binary search */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { const struct exception_table_entry *mid; @@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first, if (mid) return mid; - while (first <= last) { - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } int fixup_exception(struct pt_regs *regs) diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index db214e9931d9..2422511dc8c5 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start, /* Caller knows they are in a range if ret->fixup == 0 */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *start, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - const struct exception_table_entry *walk; + int i; /* Single insn entries are encoded as: * word 1: insn address * word 2: fixup code address * * Range entries are encoded as: * word 1: first insn address * word 2: 0 * word 3: last insn address + 4 bytes * word 4: fixup code address * * Deleted entries are encoded as: * word 1: unused * word 2: -1 (can't be -1 otherwise) */ /* 1. Try to find an exact match. */ - for (walk = start; walk <= last; walk++) { - if (walk->fixup == 0) { + for (i = 0; i < num; i++) { + if (base[i].fixup == 0) { /* A range entry, skip both parts. */ - walk++; + i++; continue; } /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) + if (base[i].fixup == -1) continue; - if (walk->insn == value) - return walk; + if (base[i].insn == value) + return &base[i]; } /* 2. Try to find a range match.
*/ - for (walk = start; walk <= (last - 1); walk++) { - if (walk->fixup) + for (i = 0; i < (num - 1); i++) { + if (base[i].fixup) continue; - if (walk[0].insn <= value && walk[1].insn > value) - return walk; + if (base[i].insn <= value && base[i + 1].insn > value) + return &base[i]; - walk++; + i++; } return NULL; diff --git a/include/linux/extable.h b/include/linux/extable.h index 7effea4b257d..28addad0dda7 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -2,13 +2,14 @@ #define _LINUX_EXTABLE_H #include <linux/stddef.h> /* for NULL */ +#include <linux/types.h> struct module; struct exception_table_entry; const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value); void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/lib/extable.c b/lib/extable.c index 62968daa66a9..f54996fdd0b8 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/bsearch.h> #include #include #include @@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size) * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ -static int cmp_ex(const void *a, const void *b) +static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; @@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); + cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES @@ -93,6 +94,20 @@ void trim_init_extable(struct module *m) #endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE + +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > ex_to_insn(_elt)) + return 1; + if (_key < ex_to_insn(_elt)) + return -1; + return 0; +} + /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, * @@ -101,25 +116,11 @@ void trim_init_extable(struct module *m) * already sorted.
*/ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - while (first <= last) { - const struct exception_table_entry *mid; - - mid = ((last - first) >> 1) + first; - /* - * careful, the distance between value and insn - * can be larger than MAX_LONG: - */ - if (ex_to_insn(mid) < value) - first = mid + 1; - else if (ex_to_insn(mid) > value) - last = mid - 1; - else - return mid; - } - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } #endif -- cgit v1.2.3-71-gd317 From 4ea77014af0d6205b05503d1c7aac6eace11d473 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:52:57 -0700 Subject: kernel/signal.c: avoid undefined behaviour in kill_something_info When running kill(72057458746458112, 0) in userspace I hit the following issue. UBSAN: Undefined behaviour in kernel/signal.c:1462:11 negation of -2147483648 cannot be represented in type 'int': CPU: 226 PID: 9849 Comm: test Tainted: G B ---- ------- 3.10.0-327.53.58.70.x86_64_ubsan+ #116 Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, BIOS BLHSV028 11/11/2014 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SYSC_kill+0x43e/0x4d0 SyS_kill+0xe/0x10 system_call_fastpath+0x16/0x1b Add code to avoid the UBSAN detection. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1496670008-59084-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 48a59eefd8ad..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, -- cgit v1.2.3-71-gd317 From dd83c161fbcc5d8be637ab159c0de015cbff5ba4 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:53:01 -0700 Subject: kernel/exit.c: avoid undefined behaviour when calling wait4() wait4(-2147483648, 0x20, 0, 0xdd0000) triggers: UBSAN: Undefined behaviour in kernel/exit.c:1651:9 The related calltrace is as follows: negation of -2147483648 cannot be represented in type 'int': CPU: 9 PID: 16482 Comm: zj Tainted: G B ---- ------- 3.10.0-327.53.58.71.x86_64+ #66 Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 /BC11BTSA , BIOS CTSAV036 04/27/2011 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SyS_wait4+0x1cb/0x1e0 system_call_fastpath+0x16/0x1b Exclude the overflow to avoid the UBSAN warning. Link: http://lkml.kernel.org/r/1497264618-20212-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: David Rientjes Cc: Aneesh Kumar K.V Cc: Kirill A. 
Shutemov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 608c9775a37b..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { -- cgit v1.2.3-71-gd317 From 6a8a75f3235724c5941a33e287b2f98966ad14c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Jul 2017 10:56:54 +0200 Subject: Revert "perf/core: Drop kernel samples even though :u is specified" This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829. This commit introduced a regression that broke rr-project, which uses sampling events to receive a signal on overflow (but does not care about the contents of the sample). These signals are critical to the correct operation of rr. There's been some back and forth about how to fix it - but, to not keep applications in limbo, queue up a revert. Reported-by: Kyle Huey Acked-by: Kyle Huey Acked-by: Peter Zijlstra Cc: Jin Yao Cc: Vince Weaver Cc: Linus Torvalds Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Stephane Eranian Cc: Namhyung Kim Cc: Jiri Olsa Cc: Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d2c32f98482..9747e422ab20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7308,21 +7308,6 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -7343,12 +7328,6 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - /* - * For security, drop the skid kernel samples if necessary. - */ - if (!sample_is_allowed(event, regs)) - return ret; - /* * XXX event_limit might not quite work as expected on inherited * events -- cgit v1.2.3-71-gd317 From dea1d0f5f1284e3defee4b8484d9fc230686cd42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 22:06:24 +0200 Subject: smp/hotplug: Replace BUG_ON and react usefully The move of the unpark functions to the control thread moved the BUG_ON() there as well. While it made some sense in the idle thread of the upcoming CPU, it's bogus to crash the control thread on the already online CPU, especially as the function has a return value and the callsite is prepared to handle an error return. Replace it with a WARN_ON_ONCE() and return a proper error code.
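The resulting fail-soft pattern (warn once and hand back an error the call site already handles, instead of killing the machine) can be sketched as follows; this is a simplified illustration of the fix, not the complete function:

static int bringup_wait_for_ap_sketch(unsigned int cpu)
{
        /* ... wait for the AP to reach CPUHP_AP_ONLINE_IDLE ... */
        if (WARN_ON_ONCE(!cpu_online(cpu)))
                return -ECANCELED;      /* caller is prepared for an error */
        /* ... unpark the stopper and hotplug threads ... */
        return 0;
}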
Fixes: 9cd4f1a4e7a8 ("smp/hotplug: Move unparking of percpu threads to the control CPU") Rightfully-ranted-at-by: Linus Torvalds Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ab860453841d..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -279,7 +279,8 @@ static int bringup_wait_for_ap(unsigned int cpu) /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); - BUG_ON(!cpu_online(cpu)); + if (WARN_ON_ONCE((!cpu_online(cpu)))) + return -ECANCELED; /* Unpark the stopper thread and the hotplug thread of the target cpu */ stop_machine_unpark(cpu); -- cgit v1.2.3-71-gd317 From b11fb73743fc406204e0749ead18560aeda8b136 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 15:43:24 -0400 Subject: tracing: Fixup trace file header alignment The addition of TGID to the tracing header added a check to see if TGID should be displayed or not, and updated the header accordingly. Unfortunately, it broke the default header. Also add constant strings to use for spacing. This makes the header definition slightly less readable, but it cuts the extended lines down from well over 80 characters.

Before this change:

# tracer: function
#
#            _-----=> irqs-off
#           / _----=> need-resched
#          | / _---=> hardirq/softirq
#          || / _--=> preempt-depth
#          ||| / delay
# TASK-PID CPU#|||| TIMESTAMP FUNCTION
# | | | |||| | |
  swapper/0-1 [000] .... 0.277830: migration_init <-do_one_initcall
  swapper/0-1 [002] d... 13.861967: Unknown type 1201
  swapper/0-1 [002] d..1 13.861970: Unknown type 1202

After this change:

# tracer: function
#
#            _-----=> irqs-off
#           / _----=> need-resched
#          | / _---=> hardirq/softirq
#          || / _--=> preempt-depth
#          ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
  swapper/0-1 [000] .... 0.278245: migration_init <-do_one_initcall
  swapper/0-1 [003] d... 13.861189: Unknown type 1201
  swapper/0-1 [003] d..1 13.861192: Unknown type 1202

Cc: Joel Fernandes Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92af8fd1429b..dabd810a10cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3358,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - - seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); - seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); - seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); - seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ?
tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? " | " : space); } void -- cgit v1.2.3-71-gd317 From bbd1d27d863d5c0acee65ecd0c2e34035e1df5ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 19:21:04 -0400 Subject: tracing: Do not expose stack_trace_filter without DYNAMIC_FTRACE The "stack_trace_filter" file only makes sense if DYNAMIC_FTRACE is configured in. If it is not, then the user cannot filter any functions. Not only that, the open function causes warnings when DYNAMIC_FTRACE is not set. Link: http://lkml.kernel.org/r/20170710110521.600806-1-arnd@arndb.de Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); -- cgit v1.2.3-71-gd317 From 69449bbd65687e8e5fb968a5a0c46089f6af6001 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Jul 2017 10:44:03 +0200 Subject: ftrace: Hide cached module code for !CONFIG_MODULES When modules are disabled, we get a harmless build warning: kernel/trace/ftrace.c:4051:13: error: 'process_cached_mods' defined but not used [-Werror=unused-function] This adds the same #ifdef around the new code that exists around its caller.
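The shape of the fix is the usual idiom for config-conditional helpers: keep the callee under the same preprocessor guard as its only caller, so neither an unused-function warning nor a missing-symbol error can appear in either configuration. A minimal sketch with hypothetical names:

#ifdef CONFIG_MODULES
/* Only referenced from the notifier below, hence the shared guard. */
static void handle_module_load(const char *mod_name)
{
        /* ... per-module work ... */
}

static int module_load_notify(struct notifier_block *nb, unsigned long val,
                              void *data)
{
        handle_module_load(data);       /* sole caller, same #ifdef scope */
        return NOTIFY_OK;
}
#endif /* CONFIG_MODULES */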
Link: http://lkml.kernel.org/r/20170710084413.1820568-1-arnd@arndb.de Fixes: d7fbf8df7ca0 ("ftrace: Implement cached modules tracing on module load") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2953d558bbee..4706f0ed193e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3978,6 +3978,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable); +#ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) { @@ -4068,6 +4069,7 @@ static void process_cached_mods(const char *mod_name) kfree(mod); } +#endif /* * We register the module command as a template to show others how -- cgit v1.2.3-71-gd317 From 19d39a3810e7032f311ef83effdac40339b9d022 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 23:41:52 +0200 Subject: genirq: Keep chip buslock across irq_request/release_resources() Moving the irq_request/release_resources() callbacks out of the spinlocked, irq disabled and bus locked region unearthed an interesting abuse of the irq_bus_lock/irq_bus_sync_unlock() callbacks. The OMAP GPIO driver merrily does power management inside of them. The irq_request_resources() callback of this GPIO irqchip calls a function which reads a GPIO register. That read aborts now because the clock of the GPIO block is not magically enabled via the irq_bus_lock() callback. Move the callbacks under the bus lock again to prevent this. In the free_irq() path this requires dropping the bus_lock before calling synchronize_irq() and reacquiring it before calling the irq_release_resources() callback. The bus lock can't be held because: 1) The data which has been changed between bus_lock/unlock is cached in the irq chip driver private data and needs to go out to the irq chip via the slow bus (usually SPI or I2C) before calling synchronize_irq(). That's the reason why this bus_lock/unlock magic exists in the first place, as you cannot do SPI/I2C transactions while holding desc->lock with interrupts disabled. 2) synchronize_irq() will actually deadlock, if there is a handler in flight. These chips use threaded handlers for obvious reasons, as they allow SPI/I2C communication. When the threaded handler returns then bus_lock needs to be taken in irq_finalize_oneshot() as we need to talk to the actual irq chip once more. After that the threaded handler is marked done, which makes synchronize_irq() return. So if we hold bus_lock across the synchronize_irq() call, the handler cannot mark itself done because it blocks on the bus lock. That in turn makes synchronize_irq() wait forever on the threaded handler to complete.... Add the missing unlock of desc->request_mutex in the error path of __free_irq() and add a bunch of comments to explain the locking and protection rules.
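Condensed into one sketch, the lock nesting that this patch establishes and documents looks like the following (assembled from the patch's own comments, with error handling omitted, so treat it as an outline rather than a verbatim excerpt):

/*
 * desc->request_mutex  serializes request_irq() against free_irq()
 *   chip_bus_lock      serializes slow-bus (I2C/SPI) chip access
 *     desc->lock       protects against hard interrupts
 */
mutex_lock(&desc->request_mutex);
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
/* ... install or remove the irqaction ... */
raw_spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(desc);     /* dropped before synchronize_irq() in free */
mutex_unlock(&desc->request_mutex);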
Fixes: 46e48e257360 ("genirq: Move irq resource handling out of spinlocked region") Reported-and-tested-by: Sebastian Reichel Reported-and-tested-by: Tony Lindgren Reported-by: Pavel Machek Signed-off-by: Thomas Gleixner Not-longer-ranted-at-by: Linus Torvalds Cc: Linus Walleij Cc: Grygorii Strashko Cc: Marc Zyngier --- kernel/irq/manage.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5624b2dd6b58..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. + * + * Locking rules: + * + * desc->request_mutex Provides serialization against a concurrent free_irq() + * chip_bus_lock Provides serialization for slow bus operations + * desc->lock Provides serialization against hard interrupts + * + * chip_bus_lock and desc->lock are sufficient for all other management and + * interrupt related functions. desc->request_mutex solely serializes + * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) @@ -1167,20 +1177,35 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + /* + * Protects against a concurrent __free_irq() call which might wait + * for synchronize_irq() to complete without holding the optional + * chip bus lock and desc->lock. + */ mutex_lock(&desc->request_mutex); + + /* + * Acquire bus lock as the irq_request_resources() callback below + * might rely on the serialization or the magic power management + * functions which are abusing the irq_bus_lock() callback, + */ + chip_bus_lock(desc); + + /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mutex; + goto out_bus_unlock; } } - chip_bus_lock(desc); - /* * The following block of code has to be executed atomically + * protected against a concurrent interrupt and any of the other + * management calls which are not serialized via + * desc->request_mutex or the optional bus lock. 
*/ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; @@ -1286,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - irq_release_resources(desc); + if (ret) goto out_unlock; - } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ @@ -1385,12 +1408,10 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); - if (!desc->action) irq_release_resources(desc); - -out_mutex: +out_bus_unlock: + chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: @@ -1472,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); return NULL; } @@ -1498,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + /* + * Drop bus_lock here so the changes which were done in the chip + * callbacks above are synced out to the irq chips which hang + * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * + * Aside of that the bus_lock can also be taken from the threaded + * handler in irq_finalize_oneshot() which results in a deadlock + * because synchronize_irq() would wait forever for the thread to + * complete, which is blocked on the bus lock. + * + * The still held desc->request_mutex() protects against a + * concurrent request_irq() of this irq so the release of resources + * and timing data is properly serialized. + */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1530,8 +1566,15 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + /* Last action releases resources */ if (!desc->action) { + /* + * Reaquire bus lock as irq_release_resources() might + * require it to deallocate resources over the slow bus. + */ + chip_bus_lock(desc); irq_release_resources(desc); + chip_bus_sync_unlock(desc); irq_remove_timings(desc); } -- cgit v1.2.3-71-gd317 From ab2f7cf141aa6734c4ca7525132d8cc236efee77 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 6 Jul 2017 10:53:20 -0700 Subject: cpufreq: schedutil: Fix sugov_start() versus sugov_update_shared() race With a shared policy in place, when one of the CPUs in the policy is hotplugged out and then brought back online, sugov_stop() and sugov_start() are called in order. sugov_stop() removes utilization hooks for each CPU in the policy and does nothing else in the for_each_cpu() loop. sugov_start() on the other hand iterates through the CPUs in the policy and re-initializes the per-cpu structure _and_ adds the utilization hook. This implies that the scheduler is allowed to invoke a CPU's utilization update hook when the rest of the per-cpu structures have yet to be re-inited. Apart from some strange values in tracepoints this doesn't cause a problem, but if we do end up accessing a pointer from the per-cpu sugov_cpu structure somewhere in the sugov_update_shared() path, we will likely see crashes since the memset for another CPU in the policy is free to race with sugov_update_shared from the CPU that is ready to go. So let's fix this now to first init all per-cpu structures, and then add the per-cpu utilization update hooks all at once. Signed-off-by: Vikram Mulukutla Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? sugov_update_shared : -- cgit v1.2.3-71-gd317 From 44925dfff05fd1a897992d278b15a6b6b55e79a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:33:40 +0300 Subject: ftrace: Remove an unneeded NULL check "func" can't be NULL and it doesn't make sense to check because we've already dereferenced it. Link: http://lkml.kernel.org/r/20170712073340.4enzeojeoupuds5a@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4706f0ed193e..5fb5b40b3ae8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3950,7 +3950,7 @@ static int cache_mod(struct trace_array *tr, continue; /* no func matches all */ - if (!func || strcmp(func, "*") == 0 || + if (strcmp(func, "*") == 0 || (ftrace_mod->func && strcmp(ftrace_mod->func, func) == 0)) { ret = 0; -- cgit v1.2.3-71-gd317 From 2e028c4fe12907f226b8221815f16c2486ad3aa7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:35:57 +0300 Subject: ftrace: Fix uninitialized variable in match_records() My static checker complains that if "func" is NULL then "clear_filter" is uninitialized. This seems like it could be true, although it's possible something subtle is happening that I haven't seen. kernel/trace/ftrace.c:3844 match_records() error: uninitialized symbol 'clear_filter'. Link: http://lkml.kernel.org/r/20170712073556.h6tkpjcdzjaozozs@mwanda Cc: stable@vger.kernel.org Fixes: f0a3b154bd7 ("ftrace: Clarify code for mod command") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5fb5b40b3ae8..53f6b6401cf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3816,7 +3816,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, -- cgit v1.2.3-71-gd317 From 58c7ffc0747a3a9145629d4966291f0586703767 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jul 2017 04:59:45 +0100 Subject: fix a braino in compat_sys_getrlimit() Reported-and-tested-by: Meelis Roos Fixes: commit d9e968cb9f84 "getrlimit()/setrlimit(): move compat to native" Signed-off-by: Al Viro Acked-by: David S.
Miller Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 73fc0af147d0..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, ret = do_prlimit(current, resource, NULL, &r); if (!ret) { - struct rlimit r32; + struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else -- cgit v1.2.3-71-gd317 From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Jul 2017 14:33:11 -0700 Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts The reason to disable interrupts seems to be to avoid switching to a different processor while handling per cpu data using individual loads and stores. If we use per cpu RMW (read-modify-write) primitives we will not have to disable interrupts. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org Signed-off-by: Christoph Lameter Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f69a3e5281e..d2b9d7c31eaf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; -- cgit v1.2.3-71-gd317 From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:14 -0700 Subject: kexec: move vmcoreinfo out of the kernel's .bss section As Eric said, "what we need to do is move the variable vmcoreinfo_note out of the kernel's .bss section. And modify the code to regenerate and keep this information in something like the control page. Definitely something like this needs a page all to itself, and ideally far away from any other kernel data structures. I clearly was not watching closely the data someone decided to keep this silly thing in the kernel's .bss section." This patch allocates extra pages for these vmcoreinfo_XXX variables; one advantage is that it improves the safety of vmcoreinfo, because vmcoreinfo now is kept far away from other kernel data structures.
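The consequence for address translation is worth calling out: once the note is a runtime allocation instead of a .bss symbol, its physical address must come from the direct mapping rather than the kernel image mapping. Both lines below are taken from the patch's change to paddr_vmcoreinfo_note():

/* before: the note lives in .bss, translate via the kernel image mapping */
return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);

/* after: the note is an allocated page, the generic direct-map rule applies */
return __pa(vmcoreinfo_note);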
Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Reviewed-by: Juergen Gross Suggested-by: Eric Biederman Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Hari Bathini Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 5 ----- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/setup.c | 6 ------ arch/x86/kernel/crash.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- include/linux/crash_core.h | 4 ++-- kernel/crash_core.c | 26 ++++++++++++++++++++++---- kernel/ksysfs.c | 2 +- 8 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 599507bcec91..c14815dca747 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) #endif } -phys_addr_t paddr_vmcoreinfo_note(void) -{ - return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); -} - diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 49a6bd45957b..3d0b14afa232 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3ae756c0db3d..3d1d808ea8a9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); } -static void __init setup_vmcoreinfo(void) -{ - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - #ifdef CONFIG_CRASH_DUMP /* @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..cab28cf2cffb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 4090a42578a8..87506a02e914 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -19,7 +19,7 @@ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) -#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_BYTES PAGE_SIZE #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define 
VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) #define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ @@ -56,7 +56,7 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..2837d6164db8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,10 @@ #include /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; +u32 *vmcoreinfo_note; /* * parsing the "crashkernel" commandline @@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index df1a9aa602a0..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.2.3-71-gd317 From 5203f4995d9a87952a83c2ce7866adbbe8f97bb5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:17 -0700 Subject: powerpc/fadump: use the correct VMCOREINFO_NOTE_SIZE for phdr vmcoreinfo_max_size stands for the size of vmcoreinfo_data; the correct thing to use is VMCOREINFO_NOTE_SIZE, the total size of vmcoreinfo_note. As explained in commit 77019967f06b ("kdump: fix exported size of vmcoreinfo note"), this should not affect the actual function, but we had better fix it; the change is also safe and backward compatible. After this, we can get rid of the variable vmcoreinfo_max_size and use the corresponding macros directly; fewer variables mean more safety for vmcoreinfo operation.
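To see why the data size undersizes the program header, recall how the note size is composed. The definition below is reconstructed from include/linux/crash_core.h (its first line is visible in the hunk above; the continuation is an assumption based on the components listed there): the ELF note wraps the raw vmcoreinfo data in two note headers plus the name field, so it is strictly larger than VMCOREINFO_BYTES alone.

#define VMCOREINFO_NOTE_SIZE    ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
                                 VMCOREINFO_NOTE_NAME_BYTES + \
                                 VMCOREINFO_BYTES)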
[xlpang@redhat.com: fix build warning] Link: http://lkml.kernel.org/r/1494830606-27736-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/1493281021-20737-2-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Reviewed-by: Mahesh Salgaonkar Reviewed-by: Dave Young Cc: Hari Bathini Cc: Benjamin Herrenschmidt Cc: Eric Biederman Cc: Juergen Gross Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/fadump.c | 3 +-- include/linux/crash_core.h | 1 - kernel/crash_core.c | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3079518f2245..dc0c49cfd90a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -999,8 +999,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 87506a02e914..e5df1b3cf072 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -58,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2837d6164db8..315adbf9cb68 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -16,7 +16,6 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; u32 *vmcoreinfo_note; /* @@ -343,7 +342,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); -- cgit v1.2.3-71-gd317 From 1229384f5b856d83698c38f9dedfd836e26711cb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:21 -0700 Subject: kdump: protect vmcoreinfo data under the crash memory Currently vmcoreinfo data is updated at boot time via subsys_initcall(), so it runs the risk of being modified by wrong code while the system is running. As a result, the dumped vmcore may contain wrong vmcoreinfo. Later on, parsing this vmcore with utilities such as "crash" or "makedumpfile" will then probably hit "Segmentation fault" or other unexpected errors. E.g. 1) wrong code overwrites vmcoreinfo_data; 2) the system then crashes; 3) kdump is triggered, and we obviously fail to recognize the crash context correctly due to the corrupted vmcoreinfo. Now, except for vmcoreinfo, all the crash data is well protected (including the cpu note, which is fully updated in the crash path, so its correctness is guaranteed). Given that vmcoreinfo data is a large chunk prepared for kdump, we had better protect it as well. To solve this, we relocate and copy vmcoreinfo_data to the crash memory when kdump is loaded via the kexec syscalls.
Because the whole crash memory will be protected by existing arch_kexec_protect_crashkres() mechanism, we naturally protect vmcoreinfo_data from write(even read) access under kernel direct mapping after kdump is loaded. Since kdump is usually loaded at the very early stage after boot, we can trust the correctness of the vmcoreinfo data copied. On the other hand, we still need to operate the vmcoreinfo safe copy when crash happens to generate vmcoreinfo_note again, we rely on vmap() to map out a new kernel virtual address and update to use this new one instead in the following crash_save_vmcoreinfo(). BTW, we do not touch vmcoreinfo_note, because it will be fully updated using the protected vmcoreinfo_data after crash which is surely correct just like the cpu crash note. Link: http://lkml.kernel.org/r/1493281021-20737-3-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Eric Biederman Cc: Hari Bathini Cc: Juergen Gross Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 +- include/linux/kexec.h | 2 ++ kernel/crash_core.c | 17 ++++++++++++++++- kernel/kexec.c | 8 ++++++++ kernel/kexec_core.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/kexec_file.c | 8 ++++++++ 6 files changed, 74 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index e5df1b3cf072..2df2118fbe13 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -28,6 +28,7 @@ typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +void crash_update_vmcoreinfo_safecopy(void *ptr); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) @@ -57,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern u32 *vmcoreinfo_note; -extern size_t vmcoreinfo_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 65888418fb69..dd056fab9e35 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -172,6 +172,7 @@ struct kimage { unsigned long start; struct page *control_code_page; struct page *swap_page; + void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; @@ -241,6 +242,7 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 315adbf9cb68..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,9 +15,12 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; +static size_t vmcoreinfo_size; u32 *vmcoreinfo_note; +/* trusted vmcoreinfo, e.g. 
we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + /* * parsing the "crashkernel" commandline * @@ -323,11 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 766e7e4d3ad9..c8f7f77e9fa9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -298,6 +298,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; -- cgit v1.2.3-71-gd317 From a19ac3374995382a994653ff372b98ea7cbad548 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Wed, 12 Jul 2017 14:33:30 -0700 Subject: sysctl: kdoc'ify sysctl_writes_strict Document the different sysctl_writes_strict modes in code. Link: http://lkml.kernel.org/r/20170519033554.18592-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..02725178694a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. + */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3-71-gd317 From d383d48470819e86fe30eb72f0e9494e1ee0e2af Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:33 -0700 Subject: sysctl: fold sysctl_writes_strict checks into helper The mode sysctl_writes_strict positional checks keep being copy and pasted as we add new proc handlers. Just add a helper to avoid code duplication. Link: http://lkml.kernel.org/r/20170519033554.18592-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 02725178694a..6f3bb1f099fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1970,6 +1970,32 @@ static void warn_sysctl_write(struct ctl_table *table) current->comm, table->procname); } +/** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. 
+ */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1990,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2193,17 +2219,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2468,17 +2485,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; -- cgit v1.2.3-71-gd317 From 4f2fec00afa60aa8e5d1b7f2a8e0526900f55623 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:36 -0700 Subject: sysctl: simplify unsigned int support Commit e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") added proc_douintvec() to start adding support for unsigned int; this, however, was only half the work needed. Two fixes have come in since then for the following issues: o Printing the values shows a negative value; this happens since do_proc_dointvec() is used and it prints through proc_put_long(). This was fixed by commit 5380e5644afbba9 ("sysctl: don't print negative flag for proc_douintvec"). o We can easily wrap around the int values: UINT_MAX is 4294967295; if we echo in 4294967295 + 1 we end up with 0, and with 4294967295 + 2 we end up with 1. o We echo negative values in and they are accepted. This was fixed by commit 425fffd886ba ("sysctl: report EINVAL if value is larger than UINT_MAX for proc_douintvec"). It still also failed to be added to sysctl_check_table()... Instead of adding it with the current implementation, just provide proper, simplified unsigned int support: no array support and no negative values at all. Historically sysctl proc helpers have supported arrays; given the complexity this adds, though, we've taken a step back to evaluate the array users and determine whether it's worth the upkeep for unsigned int. An evaluation using Coccinelle has been done to perform a grammatical search to ask ourselves: o How many sysctl proc_dointvec() (int) users exist which likely should be moved over to proc_douintvec() (unsigned int) ? Answer: about 8 - Of these how many are array users ? Answer: Probably only 1 o How many sysctl array users exist ? Answer: about 12 This last question gives us an idea of just how popular arrays are: they are not.
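As an editorial illustration (not part of the patch), the wrap-around hazard called out above is easy to model in plain userspace C. The sketch below mirrors the UINT_MAX guard that the new do_proc_douintvec_conv() applies; the helper name is made up, and it assumes an LP64 target where unsigned long is wider than unsigned int:

#include <stdio.h>
#include <limits.h>

/* Userspace model of the conversion guard: without the explicit check,
 * the narrowing assignment below is defined modulo 2^32, which is why
 * echoing "4294967296" into a u32 sysctl used to read back as 0. */
static int douintvec_conv_model(unsigned long lval, unsigned int *valp)
{
	if (lval > UINT_MAX)
		return -1;	/* the kernel returns -EINVAL here */
	*valp = (unsigned int)lval;
	return 0;
}

int main(void)
{
	unsigned long inputs[] = { 4294967295UL, 4294967296UL, 4294967297UL };
	unsigned int val;

	for (int i = 0; i < 3; i++) {
		if (douintvec_conv_model(inputs[i], &val) == 0)
			printf("%lu accepted -> %u\n", inputs[i], val);
		else
			printf("%lu rejected (plain truncation would give %u)\n",
			       inputs[i], (unsigned int)inputs[i]);
	}
	return 0;
}

On an LP64 machine this prints one accepted value and two rejections, matching the post-fix sysctl behavior.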
Array support should probably just be kept for strings. The identified uint ports are: drivers/infiniband/core/ucma.c - max_backlog drivers/infiniband/core/iwcm.c - default_backlog net/core/sysctl_net_core.c - rps_sock_flow_sysctl() net/netfilter/nf_conntrack_timestamp.c - nf_conntrack_timestamp -- bool net/netfilter/nf_conntrack_acct.c - nf_conntrack_acct -- bool net/netfilter/nf_conntrack_ecache.c - nf_conntrack_events -- bool net/netfilter/nf_conntrack_helper.c - nf_conntrack_helper -- bool net/phonet/sysctl.c - proc_local_port_range() The only possible array user is proc_local_port_range(), but it does not seem worth adding array support just for this, given that the range support works just as well. Unsigned int support is desirable mainly when you *need* more than INT_MAX, or when int min/max support does not suffice for your ranges. If you mistakenly register an unsigned int proc entry with an array, the registration will fail and you will get something as follows: sysctl table check failed: debug/test_sysctl//uint_0002 array now allowed CPU: 2 PID: 1342 Comm: modprobe Tainted: G W E Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Call Trace: dump_stack+0x63/0x81 __register_sysctl_table+0x350/0x650 ? kmem_cache_alloc_trace+0x107/0x240 __register_sysctl_paths+0x1b3/0x1e0 ? 0xffffffffc005f000 register_sysctl_table+0x1f/0x30 test_sysctl_init+0x10/0x1000 [test_sysctl] do_one_initcall+0x52/0x1a0 ? kmem_cache_alloc_trace+0x107/0x240 do_init_module+0x5f/0x200 load_module+0x1867/0x1bd0 ? __symbol_put+0x60/0x60 SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 entry_SYSCALL_64_fastpath+0x1e/0xad RIP: 0033:0x7f042b22d119 Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Link: http://lkml.kernel.org/r/20170519033554.18592-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Alexey Dobriyan Cc: Subash Abhinov Kasiviswanathan Cc: Liping Zhang Cc: Alexey Dobriyan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 14 +++++ kernel/sysctl.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 32c9c5630507..ee6feba8b6c0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1061,6 +1061,18 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
return -EINVAL; } +static int sysctl_check_table_array(const char *path, struct ctl_table *table) +{ + int err = 0; + + if (table->proc_handler == proc_douintvec) { + if (table->maxlen != sizeof(unsigned int)) + err |= sysctl_err(path, table, "array now allowed"); + } + + return err; +} + static int sysctl_check_table(const char *path, struct ctl_table *table) { int err = 0; @@ -1081,6 +1093,8 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "No data"); if (!table->maxlen) err |= sysctl_err(path, table, "No maxlen"); + else + err |= sysctl_check_table_array(path, table); } if (!table->proc_handler) err |= sysctl_err(path, table, "No proc_handler"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f3bb1f099fa..d12078fc215f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2175,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2287,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) 
tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2322,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* -- cgit v1.2.3-71-gd317 From 61d9b56a89208d8cccd0b4cfec7e6959717e16e3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:40 -0700 Subject: sysctl: add unsigned int range support To keep parity with the regular int interfaces, provide an unsigned int proc_douintvec_minmax() which allows you to specify a range of allowed valid numbers. Adding proc_douintvec_minmax_sysadmin() is easy but we can wait for an actual user for that. Link: http://lkml.kernel.org/r/20170519033554.18592-6-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook Cc: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W.
Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ee6feba8b6c0..8f9d564d0969 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1065,7 +1065,8 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) { int err = 0; - if (table->proc_handler == proc_douintvec) { + if ((table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array now allowed"); } @@ -1083,6 +1084,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..225001d437ae 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -47,6 +47,9 @@ extern int proc_douintvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d12078fc215f..df9f2a367882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2567,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. 
+ */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3066,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3108,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3-71-gd317 From 9380fa60b10ebd6ee7c3fcdb2cf162f4d7cf9fc5 Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 12 Jul 2017 14:34:01 -0700 Subject: kernel/sysctl_binary.c: check name array length in deprecated_sysctl_warning() Prevent use of uninitialized memory (originating from the stack frame of do_sysctl()) by verifying that the name array is filled with sufficient input data before comparing its specific entries with integer constants. Through timing measurement or analyzing the kernel debug logs, a user-mode program could potentially infer the results of comparisons against the uninitialized memory, and acquire some (very limited) information about the state of the kernel stack. The change also eliminates possible future warnings by tools such as KMSAN and other code checkers / instrumentations. Link: http://lkml.kernel.org/r/20170524122139.21333-1-mjurczyk@google.com Signed-off-by: Mateusz Jurczyk Acked-by: Kees Cook Cc: "David S. Miller" Cc: Matthew Whitehead Cc: "Eric W. Biederman" Cc: Tetsuo Handa Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { -- cgit v1.2.3-71-gd317 From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 12 Jul 2017 14:34:28 -0700 Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files With current epoll architecture target files are addressed with file_struct and file descriptor number, where the last is not unique. Moreover files can be transferred from another process via unix socket, added into queue and closed then so we won't find this descriptor in the task fdinfo list. Thus to checkpoint and restore such processes CRIU needs to find out where exactly the target file is present to add it into epoll queue. 
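Before the interface walkthrough that follows, here is a hedged userspace sketch of the new mode: a minimal self-comparison that checks an epoll target file against the plain descriptor it was registered from. It assumes a kernel with this patch and a <linux/kcmp.h> that exports struct kcmp_epoll_slot, and it uses syscall(2) directly since glibc has no kcmp() wrapper; error checking is elided for brevity:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/syscall.h>
#include <linux/kcmp.h>

int main(void)
{
	struct kcmp_epoll_slot slot;
	struct epoll_event ev = { .events = EPOLLIN };
	int efd, fds[2];
	long ret;

	pipe(fds);
	efd = epoll_create1(0);
	epoll_ctl(efd, EPOLL_CTL_ADD, fds[0], &ev);

	memset(&slot, 0, sizeof(slot));
	slot.efd = efd;		/* epoll instance to search */
	slot.tfd = fds[0];	/* target file number inside it */
	slot.toff = 0;		/* first entry with that number */

	/* Compare the epoll target file with the raw descriptor itself. */
	ret = syscall(SYS_kcmp, getpid(), getpid(), KCMP_EPOLL_TFD,
		      (unsigned long)fds[0], (unsigned long)&slot);
	printf("kcmp(KCMP_EPOLL_TFD) = %ld (0 means same struct file)\n", ret);
	return 0;
}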
For this sake one can use the kcmp call, where some particular target file from the queue is compared with an arbitrary file passed as an argument. Because epoll target files can have the same file descriptor number but a different file_struct, a caller should explicitly specify the offset within the sequence. To test whether some particular file matches an entry inside epoll, one has to: - fill a kcmp_epoll_slot structure with the epoll file descriptor, target file number and target file offset (if only one target is present, it should be 0) - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot) - the kernel fetches the file pointer matching file descriptor @fd of pid1 - looks up the file struct in the epoll queue of pid2 and returns the traditional 0, 1, 2 result for sorting purposes Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com Signed-off-by: Cyrill Gorcunov Acked-by: Andrey Vagin Cc: Al Viro Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Jason Baron Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 42 ++++++++++++++++++++++++++++++++++ include/linux/eventpoll.h | 3 +++ include/uapi/linux/kcmp.h | 10 +++++++++ kernel/kcmp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 322904c3ebdf..e7e9901c3790 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1077,6 +1077,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if (epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism.
It is called by the stored file descriptors when they diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 6daf6d4971f6..d8625d214ea7 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,7 @@ #define _LINUX_EVENTPOLL_H #include +#include /* Forward declarations to avoid compiler errors */ @@ -22,6 +23,8 @@ struct file; #ifdef CONFIG_EPOLL +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h index 84df14b37360..481e103da78e 100644 --- a/include/uapi/linux/kcmp.h +++ b/include/uapi/linux/kcmp.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_KCMP_H #define _UAPI_LINUX_KCMP_H +#include + /* Comparison type */ enum kcmp_type { KCMP_FILE, @@ -10,8 +12,16 @@ enum kcmp_type { KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, + KCMP_EPOLL_TFD, KCMP_TYPES, }; +/* Slot for KCMP_EPOLL_TFD */ +struct kcmp_epoll_slot { + __u32 efd; /* epoll file descriptor */ + __u32 tfd; /* target file number */ + __u32 toff; /* target offset within same numbered sequence */ +}; + #endif /* _UAPI_LINUX_KCMP_H */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; -- cgit v1.2.3-71-gd317 From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 12 Jul 2017 14:34:35 -0700 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: "Write to this file of integer N makes N-th call in the current task fail (N is 0-based).
Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below" Why add a new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance I've made the current interface work with all types of our sandboxes. For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to make /proc entries non-root owned. So I am fine with the current version of the code. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) (limited to 'kernel') diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure. 
# env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e1927ccd48..88b773f318cd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1355,6 +1355,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3311,6 +3358,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. 
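+ * Both the read and write handlers above return -EPERM for any task
+ * other than current, so the wide mode does not let one task drive
+ * fault injection in another.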
+ */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 20814b7d7d70..3822d749fc9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -974,6 +974,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index d2b9d7c31eaf..ade237a96308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit v1.2.3-71-gd317 From f2e0cff85ed111a3cf24d894c3fa11697dfae628 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:43 -0700 Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog() For architectures that define HAVE_NMI_WATCHDOG, instead of having them provide the complete touch_nmi_watchdog() function, just have them provide arch_touch_nmi_watchdog(). This gives the generic code more flexibility in implementing this function, and arch implementations don't miss out on touching the softlockup watchdog or other generic details. 
Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/nmi.h | 2 ++ arch/blackfin/kernel/nmi.c | 2 +- arch/mn10300/include/asm/nmi.h | 2 ++ arch/mn10300/kernel/mn10300-watchdog-low.S | 8 ++++---- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/sparc/include/asm/nmi.h | 1 + arch/sparc/kernel/nmi.c | 6 ++---- include/linux/nmi.h | 27 ++++++++++++++++----------- kernel/watchdog_hld.c | 5 ++--- 9 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/asm/nmi.h b/arch/blackfin/include/asm/nmi.h index b9caac4fcfd8..107d23705f46 100644 --- a/arch/blackfin/include/asm/nmi.h +++ b/arch/blackfin/include/asm/nmi.h @@ -9,4 +9,6 @@ #include +extern void arch_touch_nmi_watchdog(void); + #endif diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 633c37083e87..1e714329fe8a 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -190,7 +190,7 @@ static int __init init_nmi_wdt(void) } device_initcall(init_nmi_wdt); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { atomic_set(&nmi_touched[smp_processor_id()], 1); } diff --git a/arch/mn10300/include/asm/nmi.h b/arch/mn10300/include/asm/nmi.h index f3671cbbc117..b05627597b1b 100644 --- a/arch/mn10300/include/asm/nmi.h +++ b/arch/mn10300/include/asm/nmi.h @@ -11,4 +11,6 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +extern void arch_touch_nmi_watchdog(void); + #endif /* _ASM_NMI_H */ diff --git a/arch/mn10300/kernel/mn10300-watchdog-low.S b/arch/mn10300/kernel/mn10300-watchdog-low.S index f2f5c9cfaabd..34f8773de7d0 100644 --- a/arch/mn10300/kernel/mn10300-watchdog-low.S +++ b/arch/mn10300/kernel/mn10300-watchdog-low.S @@ -50,9 +50,9 @@ watchdog_handler: # we can't inline it) # ############################################################################### - .globl touch_nmi_watchdog - .type touch_nmi_watchdog,@function -touch_nmi_watchdog: + .globl arch_touch_nmi_watchdog + .type arch_touch_nmi_watchdog,@function +arch_touch_nmi_watchdog: clr d0 clr d1 mov watchdog_alert_counter, a0 @@ -63,4 +63,4 @@ touch_nmi_watchdog: lne ret [],0 - .size touch_nmi_watchdog,.-touch_nmi_watchdog + .size arch_touch_nmi_watchdog,.-arch_touch_nmi_watchdog diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index a2d8e6938d67..0d5641beadf5 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -31,7 +31,7 @@ static unsigned int watchdog; static unsigned int watchdog_hz = 1; unsigned int watchdog_alert_counter[NR_CPUS]; -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); /* * the best way to detect whether a CPU has a 'hard lockup' problem diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h index 26ad2b2607c6..284eac3ffaf2 100644 --- a/arch/sparc/include/asm/nmi.h +++ b/arch/sparc/include/asm/nmi.h @@ -7,6 +7,7 @@ void nmi_adjust_hz(unsigned int new_hz); extern atomic_t nmi_active; +void arch_touch_nmi_watchdog(void); void start_nmi_watchdog(void *unused); void stop_nmi_watchdog(void *unused); diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 95e73c63c99d..048ad783ea3f 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -51,7 
+51,7 @@ static DEFINE_PER_CPU(unsigned int, last_irq_sum); static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { if (atomic_read(&nmi_active)) { int cpu; @@ -61,10 +61,8 @@ void touch_nmi_watchdog(void) per_cpu(nmi_touch, cpu) = 1; } } - - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5e2e57536d98..bd387ef8bccd 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -6,6 +6,9 @@ #include #include +#if defined(CONFIG_HAVE_NMI_WATCHDOG) +#include +#endif #ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -58,6 +61,18 @@ static inline void reset_hung_task_detector(void) #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) +extern void hardlockup_detector_disable(void); +#else +static inline void hardlockup_detector_disable(void) {} +#endif + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +extern void arch_touch_nmi_watchdog(void); +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -65,21 +80,11 @@ static inline void reset_hung_task_detector(void) * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. */ -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -#include -extern void touch_nmi_watchdog(void); -#else static inline void touch_nmi_watchdog(void) { + arch_touch_nmi_watchdog(); touch_softlockup_watchdog(); } -#endif - -#if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void hardlockup_detector_disable(void); -#else -static inline void hardlockup_detector_disable(void) {} -#endif /* * Create trigger_all_cpu_backtrace() out of the arch-provided diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..90d688df6ce1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +66,8 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, -- cgit v1.2.3-71-gd317 From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:46 -0700 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. 
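As an editorial aside, the selection logic can be sketched at compile time in plain C, with macros standing in for the Kconfig symbols (the alternative HAVE_NMI_WATCHDOG path is described just below). The #else stub mirrors the empty fallbacks this patch adds to kernel/watchdog.c so that a configuration with only a hard lockup detector still builds:

#include <stdio.h>

/* Build with -DCONFIG_SOFTLOCKUP_DETECTOR to model a softlockup-enabled
 * kernel, or with no define to model a LOCKUP_DETECTOR-only build that
 * relies on an arch hardlockup watchdog. */
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
static int watchdog_enable_all_cpus(void)
{
	puts("softlockup: starting per-cpu watchdog threads");
	return 0;
}
#else
/* Empty stub, as in the patch, so the shared code keeps compiling. */
static int watchdog_enable_all_cpus(void)
{
	return 0;
}
#endif

int main(void)
{
	return watchdog_enable_all_cpus();
}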
Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. [npiggin@gmail.com: fix] Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 25 ++++- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 2 +- include/linux/nmi.h | 29 +++-- kernel/Makefile | 2 +- kernel/sysctl.c | 31 +++--- kernel/watchdog.c | 243 +++++++++++++++++++++++++++-------------- kernel/watchdog_hld.c | 32 ------ lib/Kconfig.debug | 45 +++++--- 11 files changed, 251 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index cae0958a2298..fb9bd7d36b05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE config HAVE_NMI bool -config HAVE_NMI_WATCHDOG - depends on HAVE_NMI - bool # # An arch should select this if it provides all these things: # @@ -288,6 +285,28 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. +config HAVE_HARDLOCKUP_DETECTOR_PERF + bool + depends on HAVE_PERF_EVENTS_NMI + help + The arch chooses to use the generic perf-NMI-based hardlockup + detector. Must define HAVE_PERF_EVENTS_NMI. + +config HAVE_NMI_WATCHDOG + depends on HAVE_NMI + bool + help + The arch provides a low level NMI watchdog. It provides + asm/nmi.h, and defines its own arch_touch_nmi_watchdog(). + +config HAVE_HARDLOCKUP_DETECTOR_ARCH + bool + select HAVE_NMI_WATCHDOG + help + The arch chooses to provide its own hardlockup detector, which is + a superset of the HAVE_NMI_WATCHDOG. It also conforms to config + interfaces and parameters provided by hardlockup detector subsystem. 
+ config HAVE_PERF_REGS bool help diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7177a3f4f418..63ed758e1d20 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..074a075a9cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a18681353d..3d2b8ce54e00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include #include -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index bd387ef8bccd..8aa01fd859fb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -11,13 +11,21 @@ #endif #ifdef CONFIG_LOCKUP_DETECTOR +void lockup_detector_init(void); +#else +static inline void lockup_detector_init(void) +{ +} +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned int hardlockup_panic; -void lockup_detector_init(void); +extern int soft_watchdog_enabled; +extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { @@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } -static inline void lockup_detector_init(void) -{ -} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +extern unsigned int hardlockup_panic; #else static inline void hardlockup_detector_disable(void) {} #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); #else +#if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif +#endif /** * touch_nmi_watchdog - restart NMI watchdog timeout. 
@@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu) } #endif -#ifdef CONFIG_LOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh); +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern atomic_t watchdog_park_in_progress; +extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly 
nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. 
*/ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. 
- */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). 
- */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f28f4252e54a..b0d01c6d4e03 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -778,34 +778,45 @@ config DEBUG_SHIRQ menu "Debug Lockups and Hangs" config LOCKUP_DETECTOR - bool "Detect Hard and Soft Lockups" + bool + +config SOFTLOCKUP_DETECTOR + bool "Detect Soft Lockups" depends on DEBUG_KERNEL && !S390 + select LOCKUP_DETECTOR help Say Y here to enable the kernel to act as a watchdog to detect - hard and soft lockups. + soft lockups. Softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config HARDLOCKUP_DETECTOR_PERF + bool + select SOFTLOCKUP_DETECTOR + +# +# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard +# lockup detector rather than the perf based detector. +# +config HARDLOCKUP_DETECTOR + bool "Detect Hard Lockups" + depends on DEBUG_KERNEL && !S390 + depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH + select LOCKUP_DETECTOR + select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF + select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH + help + Say Y here to enable the kernel to act as a watchdog to detect + hard lockups. + Hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. - The overhead should be minimal. A periodic hrtimer runs to - generate interrupts and kick the watchdog task every 4 seconds. - An NMI is generated every 10 seconds or so to check for hardlockups. - - The frequency of hrtimer and NMI events and the soft and hard lockup - thresholds can be controlled through the sysctl watchdog_thresh. 
- -config HARDLOCKUP_DETECTOR - def_bool y - depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR @@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL - default LOCKUP_DETECTOR + default SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in -- cgit v1.2.3-71-gd317 From a10a842ff81a7e3810817b3b04e4c432b6191e21 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:49 -0700 Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs After reconfiguring watchdog sysctls etc., architecture specific watchdogs may not get all their parameters updated. watchdog_nmi_reconfigure() can be implemented to pull the new values in and set the arch NMI watchdog. [npiggin@gmail.com: add code comments] Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com [arnd@arndb.de: hide unused function] Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Signed-off-by: Arnd Bergmann Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1fba9c3d66dc..cabe3e9fb620 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) { @@ -123,6 +127,22 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ +void __weak watchdog_nmi_reconfigure(void) +{ +} + + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Helper for online, unparked cpus. 
*/ @@ -600,6 +620,14 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} +#endif + #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) { @@ -619,6 +647,13 @@ static void watchdog_disable_all_cpus(void) { } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return 0; +} +#endif + static void set_sample_period(void) { } @@ -651,6 +686,8 @@ int lockup_detector_suspend(void) watchdog_enabled = 0; } + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); return ret; @@ -671,6 +708,8 @@ void lockup_detector_resume(void) if (watchdog_running && !watchdog_suspended) watchdog_unpark_threads(); + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); } @@ -696,6 +735,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -881,12 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); -#endif } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.2.3-71-gd317 From e2ae8ab4b571e2e4094a28acb60649bc2732c67f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jul 2017 14:35:58 -0700 Subject: kexec_file: adjust declaration of kexec_purgatory Defining kexec_purgatory as a zero-length char array upsets compile time size checking. Since this is built on a per-arch basis, define it as an unsized char array (like is done for other similar things, e.g. linker sections). This silences the warning generated by the future CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte" array. This drops the __weak and uses an extern instead, since both users define kexec_purgatory. Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: "Eric W. Biederman" Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 7 ------- kernel/kexec_internal.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c8f7f77e9fa9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. 
- */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ -- cgit v1.2.3-71-gd317 From 7cd815bce828220deffd1654265f0ef891567774 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 12 Jul 2017 14:36:20 -0700 Subject: fork,random: use get_random_canary() to set tsk->stack_canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ade237a96308..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* -- cgit v1.2.3-71-gd317 From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 13 Jul 2017 14:18:24 +0800 Subject: locking/rtmutex: Remove unnecessary priority adjustment We don't need to adjust priority before adding a new pi_waiter, the priority only needs to be updated after pi_waiter change or task priority change. Steven Rostedt pointed out: "Interesting, I did some git mining and this was added with the original entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I don't see the purpose of adjusting the task prio here. It is done before anything changes in the task." Signed-off-by: Alex Shi Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org [ Enhance the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; -- cgit v1.2.3-71-gd317 From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Jul 2017 00:40:28 -0700 Subject: sched/cputime: Don't use smp_processor_id() in preemptible context Recent kernels trigger this warning: BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181 caller is debug_smp_processor_id+0x17/0x19 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1 Call Trace: dump_stack+0x82/0xb8 check_preemption_disabled() debug_smp_processor_id() vtime_delta() task_cputime() thread_group_cputime() thread_group_cputime_adjusted() wait_consider_task() do_wait() SYSC_wait4() do_syscall_64() entry_SYSCALL64_slow_path() As Frederic pointed out: | Although those sched_clock_cpu() things seem to only matter when the | sched_clock() is unstable. And that stability is a condition for nohz_full | to work anyway. So probably sched_clock() alone would be enough. This patch fixes it by replacing sched_clock_cpu() with sched_clock() to avoid calling smp_processor_id() in a preemptible context. Reported-by: Xiaolong Ye Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6e3ea4ac1bda..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime) { unsigned long long clock; - clock = sched_clock_cpu(smp_processor_id()); + clock = sched_clock(); if (clock < vtime->starttime) return 0; @@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(smp_processor_id()); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); } @@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(cpu); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit v1.2.3-71-gd317 From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:24:29 -0700 Subject: sched/deadline: Fix confusing comments about selection of top pi-waiter This comment in the code is incomplete, and I believe it begs a definition of dl_boosted to make sense of the condition that follows. Rewrite the comment and also rearrange the condition that follows to reflect the first condition "we have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a typo that follows. 
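To make the rewritten condition easier to follow, here is a small standalone model of the selection logic (an illustration only, not kernel/sched/deadline.c itself; the flags are modeled as plain ints rather than the real scheduler types):

/* Standalone model of the pi_se selection discussed above. */
struct dl_params { long runtime; long deadline; };

static struct dl_params *pick_params(struct dl_params *own,
				     struct dl_params *top_waiter,
				     int waiter_is_deadline, int dl_boosted)
{
	/* Top pi-waiter is SCHED_DEADLINE and we are boosted by it:
	 * inherit its runtime and deadline. */
	if (top_waiter && waiter_is_deadline && dl_boosted)
		return top_waiter;
	/* Otherwise keep our own parameters. */
	return own;
}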
Signed-off-by: Joel Fernandes Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. -- cgit v1.2.3-71-gd317 From 5f92a7b0fcd627fbd06ceb1cee3bbe5d08d13356 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jul 2017 14:49:46 -0700 Subject: kernel/watchdog.c: use better pr_fmt prefix After commit 73ce0511c436 ("kernel/watchdog.c: move hardlockup detector to separate file"), 'NMI watchdog' is an inappropriate prefix in kernel/watchdog.c, so use 'watchdog' only. Link: http://lkml.kernel.org/r/1499928642-48983-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Babu Moger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cabe3e9fb620..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,7 +9,7 @@ * to those contributors as well. */ -#define pr_fmt(fmt) "NMI watchdog: " fmt +#define pr_fmt(fmt) "watchdog: " fmt #include #include -- cgit v1.2.3-71-gd317 From 6d7964a722afc8e4f880b947f174009063028c99 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 14 Jul 2017 14:50:11 -0700 Subject: kmod: throttle kmod thread limit If we reach the limit of modprobe_limit threads running, the next request_module() call will fail. The original reason for adding a kill was to do away with possible issues in old circumstances which would create a recursive series of request_module() calls. We can do better than just being super aggressive and rejecting calls once we've reached the limit by simply making pending callers wait until the threshold has been reduced, and then throttling them in, one by one. This throttling enables requests over the kmod concurrent limit to be processed once a pending request completes. Only the first item queued up to wait is woken up.
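As a rough userspace model of this throttle (a pthread sketch with invented names, not the kmod.c code in the diff below): callers over the limit block, and each completing request wakes exactly one waiter.

#include <pthread.h>

#define MAX_CONCURRENT 50

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t slot_free = PTHREAD_COND_INITIALIZER;
static int slots = MAX_CONCURRENT;

static void throttle_enter(void)
{
	pthread_mutex_lock(&lock);
	while (slots <= 0)			/* over the limit: wait */
		pthread_cond_wait(&slot_free, &lock);
	slots--;
	pthread_mutex_unlock(&lock);
}

static void throttle_exit(void)
{
	pthread_mutex_lock(&lock);
	slots++;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&slot_free);	/* wake exactly one waiter */
}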
The assumption here is once a task is woken it will have no other option to also kick the queue to check if there are more pending tasks -- regardless of whether or not it was successful. By throttling and processing only max kmod concurrent tasks we ensure we avoid unexpected fatal request_module() calls, and we keep memory consumption on module loading to a minimum. With x86_64 qemu, with 4 cores, 4 GiB of RAM it takes the following run time to run both tests: time ./kmod.sh -t 0008 real 0m16.366s user 0m0.883s sys 0m8.916s time ./kmod.sh -t 0009 real 0m50.803s user 0m0.791s sys 0m9.852s Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 16 +++++++--------- tools/testing/selftests/kmod/kmod.sh | 24 ++---------------------- 2 files changed, 9 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ff68198fe83b..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem); */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. @@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...) va_list args; char module_name[MODULE_NAME_LEN]; int ret; - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...) return ret; if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - return -ENOMEM; + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + wait_event_interruptible(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0); } trace_module_request(module_name, wait, _RET_IP_); @@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); return ret; } diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 10196a62ed09..8cecae9a8bca 100644 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -59,28 +59,8 @@ ALL_TESTS="$ALL_TESTS 0004:1:1" ALL_TESTS="$ALL_TESTS 0005:10:1" ALL_TESTS="$ALL_TESTS 0006:10:1" ALL_TESTS="$ALL_TESTS 0007:5:1" - -# Disabled tests: -# -# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" -# Current best-effort failure interpretation: -# Enough module requests get loaded in place fast enough to reach over the -# max_modprobes limit and trigger a failure -- before we're even able to -# start processing pending requests. 
-ALL_TESTS="$ALL_TESTS 0008:150:0" - -# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" -# Current best-effort failure interpretation: -# -# get_fs_type() requests modules using aliases as such the optimization in -# place today to look for already loaded modules will not take effect and -# we end up requesting a new module to load, this bumps the kmod_concurrent, -# and in certain circumstances can lead to pushing the kmod_concurrent over -# the max_modprobe limit. -# -# This test fails much easier than test 0008 since the alias optimizations -# are not in place. -ALL_TESTS="$ALL_TESTS 0009:150:0" +ALL_TESTS="$ALL_TESTS 0008:150:1" +ALL_TESTS="$ALL_TESTS 0009:150:1" test_modprobe() { -- cgit v1.2.3-71-gd317 From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 17 Jul 2017 19:47:02 +0200 Subject: genirq/PM: Properly pretend disabled state when force resuming interrupts Interrupts with the IRQF_FORCE_RESUME flag set have also the IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond the suspension of device irqs. Force resume works by pretending that the interrupt was disabled and then calling __irq_enable(). Incrementing the disabled depth counter was enough to do that, but with the recent changes which use state flags to avoid unnecessary hardware access, this is not longer sufficient. If the state flags are not set, then the hardware callbacks are not invoked and the interrupt line stays disabled in "hardware". Set the disabled and masked state when pretending that an interrupt got disabled by suspend. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Suggested-by: Thomas Gleixner Signed-off-by: Juergen Gross Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com --- kernel/irq/chip.c | 10 ---------- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/pm.c | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d171bc57e1e0..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dbfba9933ed2..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) diff --git a/kernel/irq/pm.c 
b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); -- cgit v1.2.3-71-gd317 From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:14:16 -0700 Subject: tracing/ring_buffer: Try harder to allocate ftrace can fail to allocate the per-CPU ring buffer on systems with a large number of CPUs coupled with large amounts of memory in use by the page cache. Currently the ring buffer allocation doesn't retry in the VM implementation even if direct-reclaim made some progress but still wasn't able to find a free page. On retrying I see that the allocations almost always succeed. The retry doesn't happen because __GFP_NORETRY is used in the tracer to prevent the case where we might OOM, however if we drop __GFP_NORETRY, we risk destabilizing the system if the OOM killer is triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag introduced recently [1]. Tested that the following still succeeds without destabilizing a system with 1GB memory. echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb [1] https://marc.info/?l=linux-mm&m=149820805124906&w=2 Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com Cc: Tim Murray Cc: Ingo Molnar Cc: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); -- cgit v1.2.3-71-gd317 From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 18 Jul 2017 14:37:24 +0800 Subject: audit: fix memleak in auditd_send_unicast_skb. Found this issue via a kmemleak report: auditd_send_unicast_skb() did not free the skb if rcu_dereference(auditd_conn) returned NULL.
unreferenced object 0xffff88082568ce00 (size 256): comm "auditd", pid 1119, jiffies 4294708499 backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_node+0xcc/0x210 [] __alloc_skb+0x5d/0x290 [] audit_make_reply+0x54/0xd0 [] audit_receive_msg+0x967/0xd70 ---------------- (gdb) list *audit_receive_msg+0x967 0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133). 1132 skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); --------------- [] audit_receive+0x52/0xa0 [] netlink_unicast+0x181/0x240 [] netlink_sendmsg+0x2c2/0x3b0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x102/0x190 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x1a/0xa5 [] 0xffffffffffffffff Signed-off-by: Shu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7cad70214b81..07def5e49cc9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } -- cgit v1.2.3-71-gd317 From 3bda69c1c3993a2bddbae01397d12bfef6054011 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Jul 2017 14:08:34 +0300 Subject: perf/core: Fix scheduling regression of pinned groups Vince Weaver reported: > I was tracking down some regressions in my perf_event_test testsuite. > Some of the tests broke in the 4.11-rc1 timeframe. > > I've bisected one of them, this report is about > tests/overflow/simul_oneshot_group_overflow > This test creates an event group containing two sampling events, set > to overflow to a signal handler (which disables and then refreshes the > event). > > On a good kernel you get the following: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 946 (perf::instructions/1000000) > fd 4 overflows: 473 (perf::instructions/2000000) > Ending counts: > Count 0: 946379875 > Count 1: 946365218 > > With the broken kernels you get: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 938 (perf::instructions/1000000) > fd 4 overflows: 318 (perf::instructions/2000000) > Ending counts: > Count 0: 946373080 > Count 1: 653373058 The root cause of the bug is that the following commit: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") erronously assumed that event's 'pinned' setting determines whether the event belongs to a pinned group or not, but in fact, it's the group leader's pinned state that matters. This was discovered by Vince in the test case described above, where two instruction counters are grouped, the group leader is pinned, but the other event is not; in the regressed case the counters were off by 33% (the difference between events' periods), but should be the same within the error margin. Fix the problem by looking at the group leader's pinning. 
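For context, the regressing scenario is a group whose leader is pinned while its sibling is not; a minimal sketch of such a setup (not Vince's actual test; error handling trimmed):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_open(struct perf_event_attr *attr, int group_fd)
{
	return syscall(__NR_perf_event_open, attr, 0, -1, group_fd, 0);
}

int main(void)
{
	struct perf_event_attr attr;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.sample_period = 1000000;
	attr.pinned = 1;		/* only the leader is pinned */
	leader = perf_open(&attr, -1);

	attr.pinned = 0;		/* the sibling is not pinned ... */
	attr.sample_period = 2000000;
	sibling = perf_open(&attr, leader);
	/* ... yet it must still be scheduled as part of a pinned
	 * group, which is what the fix below restores. */
	return (leader < 0 || sibling < 0) ? 1 : 0;
}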
Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") Link: http://lkml.kernel.org/r/87lgnmvw7h.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9747e422ab20..c9cdbd396770 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) lockdep_assert_held(&ctx->lock); + /* + * It's 'group type', really, because if our group leader is + * pinned, so are we. + */ + if (event->group_leader != event) + event = event->group_leader; + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; -- cgit v1.2.3-71-gd317 From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 20 Jul 2017 18:36:09 +0800 Subject: tracing: Fix kmemleak in instance_rmdir Hit the kmemleak when executing instance_rmdir, it forgot releasing mem of tracing_cpumask. With this fix, the warn does not appear any more. unreferenced object 0xffff93a8dfaa7c18 (size 8): comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s) hex dump (first 8 bytes): ff ff ff ff ff ff ff ff ........ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] alloc_cpumask_var_node+0x23/0x30 [] alloc_cpumask_var+0xe/0x10 [] instance_mkdir+0x90/0x240 [] tracefs_syscall_mkdir+0x40/0x70 [] vfs_mkdir+0x109/0x1b0 [] SyS_mkdir+0xd0/0x100 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d0ffcc49dba..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit v1.2.3-71-gd317 From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- include/linux/trace_events.h | 2 +- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5857390ac35a..6383115e9d2c 100644 --- a/include/linux/ftrace.h 
+++ b/include/linux/ftrace.h @@ -145,8 +145,8 @@ enum { #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_hash __rcu *notrace_hash; + struct ftrace_hash __rcu *filter_hash; struct mutex regex_lock; }; @@ -168,7 +168,7 @@ static inline void ftrace_free_init_mem(void) { } */ struct ftrace_ops { ftrace_func_t func; - struct ftrace_ops *next; + struct ftrace_ops __rcu *next; unsigned long flags; void *private; ftrace_func_t saved_func; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f73cedfa2e0b..536c80ff7ad9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -338,7 +338,7 @@ enum { struct trace_event_file { struct list_head list; struct trace_event_call *event_call; - struct event_filter *filter; + struct event_filter __rcu *filter; struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b6401cf0..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. 
We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit v1.2.3-71-gd317 From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. 
a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. 
However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) 
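The unsoundness of combining the two fall-through deductions can also be reproduced outside the verifier; a standalone illustration (plain C, not verifier code), using the value -8 from the first example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r1 = (uint64_t)-8;

	/* insn 10: if r1 > -1 (unsigned) falls through -> "min_value = 0" */
	if (!(r1 > (uint64_t)-1))
		printf("unsigned test suggests r1 >= 0\n");

	/* insn 11: if r1 s> 1 (signed) falls through -> "max_value = 1" */
	if (!((int64_t)r1 > 1))
		printf("signed test suggests r1 <= 1\n");

	/* The combined range [0, 1] is wrong: used as an offset the
	 * value is 0xfffffffffffffff8, far outside the map value. */
	printf("actual r1 = %lld (0x%llx)\n",
	       (long long)(int64_t)r1, (unsigned long long)r1);
	return 0;
}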
There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 621076f56251..8e5d31f6faef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,7 @@ struct bpf_reg_state { u32 min_align; u32 aux_off; u32 aux_off_align; + bool value_from_signed; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. 
*/ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. 
*/ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit v1.2.3-71-gd317 From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Jul 2017 16:14:55 +0200 Subject: perf/core: Fix locking for children siblings group read We're missing ctx lock when iterating children siblings within the perf_read path for group reading. 
Following race and crash can happen: User space doing read syscall on event group leader: T1: perf_read lock event->ctx->mutex perf_read_group lock leader->child_mutex __perf_read_group_add(child) list_for_each_entry(sub, &leader->sibling_list, group_entry) ----> sub might be invalid at this point, because it could get removed via perf_event_exit_task_context in T2 Child exiting and cleaning up its events: T2: perf_event_exit_task_context lock ctx->mutex list_for_each_entry_safe(child_event, next, &child_ctx->event_list,... perf_event_exit_event(child) lock ctx->lock perf_group_detach(child) unlock ctx->lock ----> child is removed from sibling_list without any sync with T1 path above ... free_event(child) Before the child is removed from the leader's child_list, (and thus is omitted from perf_read_group processing), we need to ensure that perf_read_group touches child's siblings under its ctx->lock. Peter further notes: | One additional note; this bug got exposed by commit: | | ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") | | which made it possible to actually trigger this code-path. Tested-by: Andi Kleen Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd396770..c17c0881fd36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } -- cgit v1.2.3-71-gd317
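To connect the race above to the user-visible path: T1 is simply a read() on a group leader opened with PERF_FORMAT_GROUP, as in this minimal sketch (software events chosen so it runs unprivileged; error handling trimmed):

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct group_read {
	uint64_t nr;
	struct { uint64_t value, id; } cnt[2];
};

int main(void)
{
	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_TASK_CLOCK,
		.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID,
	};
	int leader = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	syscall(__NR_perf_event_open, &attr, 0, -1, leader, 0);

	struct group_read r;
	if (read(leader, &r, sizeof(r)) > 0)	/* kernel side: perf_read_group() */
		printf("nr=%llu task-clock=%llu\n",
		       (unsigned long long)r.nr,
		       (unsigned long long)r.cnt[0].value);
	return 0;
}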