From f96f56780ca584930bb3a2769d73fd9a101bcbbe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 4 Aug 2014 03:10:16 +0000 Subject: kprobes: Skip kretprobe hit in NMI context to avoid deadlock Skip kretprobe hit in NMI context, because if an NMI happens inside the critical section protected by kretprobe_table.lock and another(or same) kretprobe hit, pre_kretprobe_handler tries to lock kretprobe_table.lock again. Normal interrupts have no problem because they are disabled with the lock. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: David S. Miller Link: http://lkml.kernel.org/r/20140804031016.11433.65539.stgit@kbuild-fedora.novalocal [ Minor edits for clarity. ] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..3995f546d0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /*TODO: consider to only swap the RA after the last pre_handler fired */ + /* + * To avoid deadlocks, prohibit return probing in NMI contexts, + * just skip the probe and increase the (inexact) 'nmissed' + * statistical counter, so that the user is informed that + * something happened: + */ + if (unlikely(in_nmi())) { + rp->nmissed++; + return 0; + } + + /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { -- cgit v1.2.3-71-gd317 From ff7e0055bb5ddbbb320cdd8dfd3e18672bddd2ad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 16 Aug 2014 04:13:37 +0930 Subject: module: Clean up ro/nx after early module load failures The commit 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING. introduced a regression: if a module fails to parse its arguments or if mod_sysfs_setup fails, then the module's memory will be freed while still read-only. Anything that reuses that memory will crash as soon as it tries to write to it. Cc: stable@vger.kernel.org # v3.16 Cc: Rusty Russell Signed-off-by: Andy Lutomirski Signed-off-by: Rusty Russell --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6f69463f0066..03214bd288e9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); -- cgit v1.2.3-71-gd317 From 71b1fb5c4473a5b1e601d41b109bdfe001ec82e0 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 18 Aug 2014 12:20:20 +0100 Subject: cgroup: reject cgroup names with '\n' /proc//cgroup contains one cgroup path on each line. If cgroup names are allowed to contain "\n", applications cannot parse /proc//cgroup safely. Signed-off-by: Alban Crequy Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..c3d1802a9b30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4543,6 +4543,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc//cgroup unparsable. + */ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; -- cgit v1.2.3-71-gd317 From b3f207855f57b9c8f43a547a801340bb5cbc59e5 Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Fri, 13 Jun 2014 16:03:32 +0100 Subject: perf: Handle compat ioctl When running a 32-bit userspace on a 64-bit kernel (eg. i386 application on x86_64 kernel or 32-bit arm userspace on arm64 kernel) some of the perf ioctls must be treated with special care, as they have a pointer size encoded in the command. For example, PERF_EVENT_IOC_ID in 32-bit world will be encoded as 0x80042407, but 64-bit kernel will expect 0x80082407. In result the ioctl will fail returning -ENOTTY. This patch solves the problem by adding code fixing up the size as compat_ioctl file operation. Reported-by: Drew Richardson Signed-off-by: Pawel Moll Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1402671812-9078-1-git-send-email-pawel.moll@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..f9c1ed002dbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "internal.h" @@ -3717,6 +3718,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + case _IOC_NR(PERF_EVENT_IOC_ID): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { struct perf_event *event; @@ -4222,7 +4243,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; -- cgit v1.2.3-71-gd317 From 33b7f99cf003ca6c1d31c42b50e1100ad71aaec0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:23:02 -0400 Subject: ftrace: Allow ftrace_ops to use the hashes from other ops Currently the top level debug file system function tracer shares its ftrace_ops with the function graph tracer. This was thought to be fine because the tracers are not used together, as one can only enable function or function_graph tracer in the current_tracer file. But that assumption proved to be incorrect. The function profiler can use the function graph tracer when function tracing is enabled. Since all function graph users uses the function tracing ftrace_ops this causes a conflict and when a user enables both function profiling as well as the function tracer it will crash ftrace and disable it. The quick solution so far is to move them as separate ftrace_ops like it was earlier. The problem though is to synchronize the functions that are traced because both function and function_graph tracer are limited by the selections made in the set_ftrace_filter and set_ftrace_notrace files. To handle this, a new structure is made called ftrace_ops_hash. This structure will now hold the filter_hash and notrace_hash, and the ftrace_ops will point to this structure. That will allow two ftrace_ops to share the same hashes. Since most ftrace_ops do not share the hashes, and to keep allocation simple, the ftrace_ops structure will include both a pointer to the ftrace_ops_hash called func_hash, as well as the structure itself, called local_hash. When the ops are registered, the func_hash pointer will be initialized to point to the local_hash within the ftrace_ops structure. Some of the ftrace internal ftrace_ops will be initialized statically. This will allow for the function and function_graph tracer to have separate ops but still share the same hash tables that determine what functions they trace. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 14 +++++-- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++++------------------------ 2 files changed, 63 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6bb5e3f2a3b4..f0b0edbf55a9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -102,6 +102,15 @@ enum { FTRACE_OPS_FL_DELETED = 1 << 8, }; +#ifdef CONFIG_DYNAMIC_FTRACE +/* The hash used to know what functions callbacks trace */ +struct ftrace_ops_hash { + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; + struct mutex regex_lock; +}; +#endif + /* * Note, ftrace_ops can be referenced outside of RCU protection. * (Although, for perf, the control ops prevent that). If ftrace_ops is @@ -121,10 +130,9 @@ struct ftrace_ops { int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE int nr_trampolines; - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_ops_hash local_hash; + struct ftrace_ops_hash *func_hash; struct ftrace_hash *tramp_hash; - struct mutex regex_lock; unsigned long trampoline; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1654b12c891a..c92757adba79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -65,15 +65,17 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE -#define INIT_REGEX_LOCK(opsname) \ - .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else -#define INIT_REGEX_LOCK(opsname) +#define INIT_OPS_HASH(opsname) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -140,7 +142,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { - mutex_init(&ops->regex_lock); + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif @@ -899,7 +902,7 @@ static void unregister_ftrace_profiler(void) static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(ftrace_profile_ops) + INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1081,11 +1084,12 @@ static const struct ftrace_hash empty_hash = { #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) static struct ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, }; struct ftrace_page { @@ -1226,8 +1230,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); - free_ftrace_hash(ops->filter_hash); - free_ftrace_hash(ops->notrace_hash); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); } static struct ftrace_hash *alloc_ftrace_hash(int size_bits) @@ -1382,8 +1386,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * gets inversed. */ if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; /* * If the notrace hash has no items, * then there's nothing to do. @@ -2436,8 +2440,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) * Filter_hash being empty will default to trace module. * But notrace hash requires a test of individual module functions. */ - return ftrace_hash_empty(ops->filter_hash) && - ftrace_hash_empty(ops->notrace_hash); + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); } /* @@ -2459,12 +2463,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 0; /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->filter_hash) && - !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) return 0; return 1; @@ -2785,10 +2789,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } else { rec = &iter->pg->records[iter->idx++]; if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -2837,9 +2841,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) * functions are enabled. */ if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) || + ftrace_hash_empty(ops->func_hash->filter_hash)) || (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->notrace_hash))) { + ftrace_hash_empty(ops->func_hash->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -3001,12 +3005,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; + hash = ops->func_hash->notrace_hash; else - hash = ops->filter_hash; + hash = ops->func_hash->filter_hash; if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; @@ -3041,7 +3045,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, file->private_data = iter; out_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); return ret; } @@ -3279,7 +3283,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(trace_probe_ops) + INIT_OPS_HASH(trace_probe_ops) }; static int ftrace_probe_registered; @@ -3342,7 +3346,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3359,7 +3363,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { @@ -3428,7 +3432,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, out_unlock: mutex_unlock(&ftrace_lock); out: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); return count; @@ -3446,7 +3450,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3468,7 +3472,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3521,7 +3525,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); } @@ -3717,12 +3721,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (enable) - orig_hash = &ops->filter_hash; + orig_hash = &ops->func_hash->filter_hash; else - orig_hash = &ops->notrace_hash; + orig_hash = &ops->func_hash->notrace_hash; if (reset) hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); @@ -3752,7 +3756,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(hash); return ret; @@ -3975,15 +3979,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); - mutex_lock(&iter->ops->regex_lock); + mutex_lock(&iter->ops->func_hash->regex_lock); if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); if (filter_hash) - orig_hash = &iter->ops->filter_hash; + orig_hash = &iter->ops->func_hash->filter_hash; else - orig_hash = &iter->ops->notrace_hash; + orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, @@ -3994,7 +3998,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } - mutex_unlock(&iter->ops->regex_lock); + mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); @@ -4611,7 +4615,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + INIT_OPS_HASH(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4713,7 +4717,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(control_ops) + INIT_OPS_HASH(control_ops) }; static inline void -- cgit v1.2.3-71-gd317 From fa8137be6ba632041e725e4623258ba27a2cf9be Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 11:44:03 -0400 Subject: cgroup: Display legacy cgroup files on default hierarchy Kernel command line parameter cgroup__DEVEL__legacy_files_on_dfl forces legacy cgroup files to show up on default hierarhcy if susbsystem does not have any files defined for default hierarchy. But this seems to be working only if legacy files are defined in ss->legacy_cftypes. If one adds some cftypes later using cgroup_add_legacy_cftypes(), these files don't show up on default hierarchy. Update the function accordingly so that the dynamically added legacy files also show up in the default hierarchy if the target subsystem is also using the base legacy files for the default hierarchy. tj: Patch description and comment updates. Signed-off-by: Vivek Goyal Signed-off-by: Tejun Heo --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c3d1802a9b30..50b94113f4f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3271,8 +3271,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; + /* + * If legacy_flies_on_dfl, we want to show the legacy files on the + * dfl hierarchy but iff the target subsystem hasn't been updated + * for the dfl hierarchy yet. + */ + if (!cgroup_legacy_files_on_dfl || + ss->dfl_cftypes != ss->legacy_cftypes) { + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + } + return cgroup_add_cftypes(ss, cfts); } -- cgit v1.2.3-71-gd317 From 84261912ebee41269004e8a9f3614ba38ef6b206 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 18 Aug 2014 13:21:08 -0400 Subject: ftrace: Update all ftrace_ops for a ftrace_hash_ops update When updating what an ftrace_ops traces, if it is registered (that is, actively tracing), and that ftrace_ops uses the shared global_ops local_hash, then we need to update all tracers that are active and also share the global_ops' ftrace_hash_ops. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c92757adba79..37f9e90d241c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1292,9 +1292,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) } static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_move(struct ftrace_ops *ops, int enable, @@ -1346,13 +1346,13 @@ update: * Remove the current set, update the hash and add * them back. */ - ftrace_hash_rec_disable(ops, enable); + ftrace_hash_rec_disable_modify(ops, enable); old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ftrace_hash_rec_enable(ops, enable); + ftrace_hash_rec_enable_modify(ops, enable); return 0; } @@ -1686,6 +1686,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; -- cgit v1.2.3-71-gd317 From bce0b6c51ac76fc0e763262a6c2a9d05e486f0d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 20 Aug 2014 23:57:04 -0400 Subject: ftrace: Fix up trampoline accounting with looping on hash ops Now that a ftrace_hash can be shared by multiple ftrace_ops, they can dec the rec->flags by more than once (one per those that share the ftrace_hash). This means that the tramp_hash may not have a hash item when it was added. For example, if two ftrace_ops share a hash for a ftrace record, and the first ops has a trampoline, when it adds itself it will set the rec->flags TRAMP flag and increments its nr_trampolines counter. When the second ops is added, it must clear that tramp flag but also decrement the other ops that shares its hash. As the update to the function callbacks has not yet been performed, the other ops will not have the tramp hash set yet and it can not be used to know to decrement its nr_trampolines. Luckily, the tramp_hash does not need to be used. As the ftrace_mutex is held, a ops with a trampoline to a record during an update of another ops that shares the record will have its func_hash pointing to it. Since a trampoline can only be set for a record if only one ops is attached to it, we can just check if the record has a trampoline (the FTRACE_FL_TRAMP flag is set) and then find the ops that has this record in its hashes. Also added some output to help debug when things go wrong. Cc: stable@vger.kernel.org # 3.16+ (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37f9e90d241c..92376aeac4a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1507,25 +1507,38 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static void ftrace_remove_tramp(struct ftrace_ops *ops, struct dyn_ftrace *rec) { - struct ftrace_func_entry *entry; - - entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); - if (!entry) + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) return; + rec->flags &= ~FTRACE_FL_TRAMP; + + if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || + ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + return; /* * The tramp_hash entry will be removed at time * of update. */ ops->nr_trampolines--; - rec->flags &= ~FTRACE_FL_TRAMP; } -static void ftrace_clear_tramps(struct dyn_ftrace *rec) +static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) { struct ftrace_ops *op; + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This function is called to clear other tramps + * not the one that is being updated. + */ + if (op == ops) + continue; if (op->nr_trampolines) ftrace_remove_tramp(op, rec); } while_for_each_ftrace_op(op); @@ -1626,13 +1639,10 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, /* * If we are adding another function callback * to this function, and the previous had a - * trampoline used, then we need to go back to - * the default trampoline. + * custom trampoline in use, then we need to go + * back to the default trampoline. */ - rec->flags &= ~FTRACE_FL_TRAMP; - - /* remove trampolines from any ops for this rec */ - ftrace_clear_tramps(rec); + ftrace_clear_tramps(rec, ops); } /* @@ -1935,8 +1945,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP) { ops = ftrace_find_tramp_ops_new(rec); if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2266,7 +2276,10 @@ static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) } while_for_each_ftrace_rec(); /* The number of recs in the hash must match nr_trampolines */ - FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) + pr_warn("count=%ld trampolines=%d\n", + ops->tramp_hash->count, + ops->nr_trampolines); return 0; } -- cgit v1.2.3-71-gd317 From 5f151b240192a1557119d5375af71efc26825bc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:18:46 -0400 Subject: ftrace: Fix function_profiler and function tracer together The latest rewrite of ftrace removed the separate ftrace_ops of the function tracer and the function graph tracer and had them share the same ftrace_ops. This simplified the accounting by removing the multiple layers of functions called, where the global_ops func would call a special list that would iterate over the other ops that were registered within it (like function and function graph), which itself was registered to the ftrace ops list of all functions currently active. If that sounds confusing, the code that implemented it was also confusing and its removal is a good thing. The problem with this change was that it assumed that the function and function graph tracer can never be used at the same time. This is mostly true, but there is an exception. That is when the function profiler uses the function graph tracer to profile. The function profiler can be activated the same time as the function tracer, and this breaks the assumption and the result is that ftrace will crash (it detects the error and shuts itself down, it does not cause a kernel oops). To solve this issue, a previous change allowed the hash tables for the functions traced by a ftrace_ops to be a pointer and let multiple ftrace_ops share the same hash. This allows the function and function_graph tracer to have separate ftrace_ops, but still share the hash, which is what is done. Now the function and function graph tracers have separate ftrace_ops again, and the function tracer can be run while the function_profile is active. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 60 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92376aeac4a7..08aca65d709a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,8 +68,12 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -4663,7 +4667,6 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -5197,6 +5200,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + static int ftrace_graph_active; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -5359,12 +5373,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) */ static void update_function_graph_func(void) { - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list == &global_ops && - global_ops.next == &ftrace_list_end)) - ftrace_graph_entry = __ftrace_graph_entry; - else + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; } static struct notifier_block ftrace_suspend_notifier = { @@ -5405,16 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - /* Function graph doesn't use the .func field of global_ops */ - global_ops.flags |= FTRACE_OPS_FL_STUB; - -#ifdef CONFIG_DYNAMIC_FTRACE - /* Optimize function graph calling (if implemented by arch) */ - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; -#endif - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5432,12 +5453,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - global_ops.flags &= ~FTRACE_OPS_FL_STUB; -#ifdef CONFIG_DYNAMIC_FTRACE - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = 0; -#endif + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3-71-gd317 From 39b5552cd5090d4c210d278cd2732f493075f033 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sun, 17 Aug 2014 20:59:10 -0400 Subject: ftrace: Use current addr when converting to nop in __ftrace_replace_code() In __ftrace_replace_code(), when converting the call to a nop in a function it needs to compare against the "curr" (current) value of the ftrace ops, and not the "new" one. It currently does not affect x86 which is the only arch to do the trampolines with function graph tracer, but when other archs that do depend on this code implement the function graph trampoline, it can crash. Here's an example when ARM uses the trampolines (in the future): ------------[ cut here ]------------ WARNING: CPU: 0 PID: 9 at kernel/trace/ftrace.c:1716 ftrace_bug+0x17c/0x1f4() Modules linked in: omap_rng rng_core ipv6 CPU: 0 PID: 9 Comm: migration/0 Not tainted 3.16.0-test-10959-gf0094b28f303-dirty #52 [] (unwind_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x78/0x94) [] (dump_stack) from [] (warn_slowpath_common+0x7c/0x9c) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x2c/0x34) [] (warn_slowpath_null) from [] (ftrace_bug+0x17c/0x1f4) [] (ftrace_bug) from [] (ftrace_replace_code+0x80/0x9c) [] (ftrace_replace_code) from [] (ftrace_modify_all_code+0xb8/0x164) [] (ftrace_modify_all_code) from [] (__ftrace_modify_code+0x14/0x1c) [] (__ftrace_modify_code) from [] (multi_cpu_stop+0xf4/0x134) [] (multi_cpu_stop) from [] (cpu_stopper_thread+0x54/0x130) [] (cpu_stopper_thread) from [] (smpboot_thread_fn+0x1ac/0x1bc) [] (smpboot_thread_fn) from [] (kthread+0xe0/0xfc) [] (kthread) from [] (ret_from_fork+0x14/0x20) ---[ end trace dc9ce72c5b617d8f ]--- [ 65.047264] ftrace failed to modify [] asm_do_IRQ+0x10/0x1c [ 65.054070] actual: 85:1b:00:eb Fixes: 7413af1fb70e7 "ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global" Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08aca65d709a..5916a8e59e87 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2017,7 +2017,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, ftrace_addr); + return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); -- cgit v1.2.3-71-gd317 From 7cad45eea3849faeb34591b60d16b50d13a38d77 Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Fri, 22 Aug 2014 01:31:20 +0200 Subject: irq: Export handle_fasteoi_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export handle_fasteoi_irq to be able to use it in e.g. the Zynq gpio driver since commit 6dd859508336 ("gpio: zynq: Fix IRQ handlers"). This fixes the following link issue: ERROR: "handle_fasteoi_irq" [drivers/gpio/gpio-zynq.ko] undefined! Signed-off-by: Vincent Stehlé Acked-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Cc: Vincent Stehle Cc: Lars-Peter Clausen Cc: Linus Walleij Link: http://lkml.kernel.org/r/1408663880-29179-1-git-send-email-vincent.stehle@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..6223fab9a9d2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -517,6 +517,7 @@ out: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler -- cgit v1.2.3-71-gd317 From 4ce97dbf50245227add17c83d87dc838e7ca79d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Aug 2014 13:59:41 -0400 Subject: trace: Fix epoll hang when we race with new entries Epoll on trace_pipe can sometimes hang in a weird case. If the ring buffer is empty when we set waiters_pending but an event shows up exactly at that moment we can miss being woken up by the ring buffers irq work. Since ring_buffer_empty() is inherently racey we will sometimes think that the buffer is not empty. So we don't get woken up and we don't think there are any events even though there were some ready when we added the watch, which makes us hang. This patch fixes this by making sure that we are actually on the wait list before we set waiters_pending, and add a memory barrier to make sure ring_buffer_empty() is going to be correct. Link: http://lkml.kernel.org/p/1408989581-23727-1-git-send-email-jbacik@fb.com Cc: stable@vger.kernel.org # 3.10+ Cc: Martin Lau Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index afb04b9b818a..b38fb2b9e237 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) -- cgit v1.2.3-71-gd317 From 11ed7f934cb807f26da09547b5946c2e534d1dac Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 27 Aug 2014 16:43:40 -0400 Subject: rcu: Make nocb leader kthreads process pending callbacks after spawning The nocb callbacks generated before the nocb kthreads are spawned are enqueued in the nocb queue for later processing. Commit fbce7497ee5af ("rcu: Parallelize and economize NOCB kthread wakeups") introduced nocb leader kthreads which checked the nocb_leader_wake flag to see if there were any such pending callbacks. A case was reported in which newly spawned leader kthreads were not processing the pending callbacks as this flag was not set, which led to a boot hang. The following commit ensures that the newly spawned nocb kthreads process the pending callbacks by allowing the kthreads to run immediately after spawning instead of waiting. This is done by inverting the logic of nocb_leader_wake tests to nocb_leader_sleep which allows us to use the default initialization of this flag to 0 to let the kthreads run. Reported-by: Amit Shah Signed-off-by: Pranith Kumar Link: http://www.spinics.net/lists/kernel/msg1802899.html [ paulmck: Backported to v3.17-rc2. ] Signed-off-by: Paul E. McKenney Tested-by: Amit Shah --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71e64c718f75..6a86eb7bac45 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -358,7 +358,7 @@ struct rcu_data { struct rcu_head **nocb_gp_tail; long nocb_gp_count; long nocb_gp_count_lazy; - bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 00dc411e9676..a7997e272564 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; - if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior xchg orders against prior callback enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } } @@ -2253,7 +2253,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); wait_event_interruptible(my_rdp->nocb_wq, - ACCESS_ONCE(my_rdp->nocb_leader_wake)); + !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2292,12 +2292,12 @@ wait_again: schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) if (ACCESS_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_wake = true; + my_rdp->nocb_leader_sleep = false; break; } goto wait_again; @@ -2307,17 +2307,17 @@ wait_again: rcu_nocb_wait_gp(my_rdp); /* - * We left ->nocb_leader_wake set to reduce cache thrashing. - * We clear it now, but recheck for new callbacks while + * We left ->nocb_leader_sleep unset to reduce cache thrashing. + * We set it now, but recheck for new callbacks while * traversing our follower list. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { if (ACCESS_ONCE(rdp->nocb_head)) - my_rdp->nocb_leader_wake = true; /* No need to wait. */ + my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ -- cgit v1.2.3-71-gd317 From 800df627e2eabaf4a921d342a1d5162c843b7fc2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:29 -0700 Subject: resource: fix the case of null pointer access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard and Daniel reported that UML is broken due to changes to resource traversal functions. Problem is that iomem_resource.child can be null and new code does not consider that possibility. Old code used a for loop and that loop will not even execute if p was null. Revert back to for() loop logic and bail out if p is null. I also moved sibling_only check out of resource_lock. There is no reason to keep it inside the lock. Following is backtrace of the UML crash. RIP: 0033:[<0000000060039b9f>] RSP: 0000000081459da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000219b3fff RCX: 000000006010d1d9 RDX: 0000000000000001 RSI: 00000000602dfb94 RDI: 0000000081459df8 RBP: 0000000081459de0 R08: 00000000601b59f4 R09: ffffffff0000ff00 R10: ffffffff0000ff00 R11: 0000000081459e88 R12: 0000000081459df8 R13: 00000000219b3fff R14: 00000000602dfb94 R15: 0000000000000000 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 1 Comm: swapper Not tainted 3.16.0-10454-g58d08e3 #13 Stack: 00000000 000080d0 81459df0 219b3fff 81459e70 6010d1d9 ffffffff 6033e010 81459e50 6003a269 81459e30 00000000 Call Trace: [<6010d1d9>] ? kclist_add_private+0x0/0xe7 [<6003a269>] walk_system_ram_range+0x61/0xb7 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6010d574>] kcore_update_ram+0x4c/0x168 [<6010d72e>] ? kclist_add+0x0/0x2e [<6000e943>] proc_kcore_init+0xea/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<600189f0>] do_one_initcall+0x13c/0x204 [<6004ca46>] ? parse_args+0x1df/0x2e0 [<6004c82d>] ? parameq+0x0/0x3a [<601b5990>] ? strcpy+0x0/0x18 [<60001e1a>] kernel_init_freeable+0x240/0x31e [<6026f1c0>] kernel_init+0x12/0x148 [<60019fad>] new_thread_handler+0x81/0xa3 Fixes 8c86e70acead629aacb4a ("resource: provide new functions to walk through resources"). Reported-by: Daniel Walter Tested-by: Richard Weinberger Tested-by: Toralf Förster Tested-by: Daniel Walter Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..60c5a3856ab7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name, end = res->end; BUG_ON(start >= end); - read_lock(&resource_lock); - - if (first_level_children_only) { - p = iomem_resource.child; + if (first_level_children_only) sibling_only = true; - } else - p = &iomem_resource; - while ((p = next_resource(p, sibling_only))) { + read_lock(&resource_lock); + + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) -- cgit v1.2.3-71-gd317 From 74ca317c26a3f8543203b61d262c0ab2e30c384e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:46 -0700 Subject: kexec: create a new config option CONFIG_KEXEC_FILE for new syscall Currently new system call kexec_file_load() and all the associated code compiles if CONFIG_KEXEC=y. But new syscall also compiles purgatory code which currently uses gcc option -mcmodel=large. This option seems to be available only gcc 4.4 onwards. Hiding new functionality behind a new config option will not break existing users of old gcc. Those who wish to enable new functionality will require new gcc. Having said that, I am trying to figure out how can I move away from using -mcmodel=large but that can take a while. I think there are other advantages of introducing this new config option. As this option will be enabled only on x86_64, other arches don't have to compile generic kexec code which will never be used. This new code selects CRYPTO=y and CRYPTO_SHA256=y. And all other arches had to do this for CONFIG_KEXEC. Now with introduction of new config option, we can remove crypto dependency from other arches. Now CONFIG_KEXEC_FILE is available only on x86_64. So whereever I had CONFIG_X86_64 defined, I got rid of that. For CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal Cc: Eric Biederman Cc: H. Peter Anvin Tested-by: Shaun Ruffell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kbuild | 4 +--- arch/x86/Kconfig | 18 ++++++++++++++---- arch/x86/Makefile | 5 +---- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/crash.c | 6 ++---- arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ arch/x86/purgatory/Makefile | 5 +---- kernel/kexec.c | 11 +++++++++++ 8 files changed, 42 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 61b6d51866f8..3942f74c92d7 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -17,6 +17,4 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ -ifeq ($(CONFIG_X86_64),y) -obj-$(CONFIG_KEXEC) += purgatory/ -endif +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d0bf1aa9dcb..778178f4c7d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1585,9 +1585,6 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" - select BUILD_BIN2C - select CRYPTO - select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1602,9 +1599,22 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_FILE + bool "kexec file based system call" + select BUILD_BIN2C + depends on KEXEC + depends on X86_64 + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + ---help--- + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by previous system call. + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC + depends on KEXEC_FILE ---help--- This option makes kernel signature verification mandatory for kexec_file_load() syscall. If kernel is signature can not be diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c1aa36887843..c96bcec544fc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -184,11 +184,8 @@ archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all archprepare: -ifeq ($(CONFIG_KEXEC),y) -# Build only for 64bit. No loaders for 32bit yet. - ifeq ($(CONFIG_X86_64),y) +ifeq ($(CONFIG_KEXEC_FILE),y) $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c - endif endif ### diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b5ea75c4a4b4..ada2e2d6be3e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o @@ -118,5 +119,4 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0553a34fa0df..a618fcd2c07d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -182,8 +182,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_X86_64 - +#ifdef CONFIG_KEXEC_FILE static int get_nr_ram_ranges_callback(unsigned long start_pfn, unsigned long nr_pfn, void *arg) { @@ -696,5 +695,4 @@ int crash_load_segments(struct kimage *image) return ret; } - -#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8b04018e5d1f..485981059a40 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -25,9 +25,11 @@ #include #include +#ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { &kexec_bzImage64_ops, }; +#endif static void free_transition_pgtable(struct kimage *image) { @@ -178,6 +180,7 @@ static void load_segments(void) ); } +#ifdef CONFIG_KEXEC_FILE /* Update purgatory as needed after various image segments have been prepared */ static int arch_update_purgatory(struct kimage *image) { @@ -209,6 +212,12 @@ static int arch_update_purgatory(struct kimage *image) return ret; } +#else /* !CONFIG_KEXEC_FILE */ +static inline int arch_update_purgatory(struct kimage *image) +{ + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ int machine_kexec_prepare(struct kimage *image) { @@ -329,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ +#ifdef CONFIG_KEXEC_FILE int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { @@ -522,3 +532,4 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 7fde9ee438a4..c4ae06e4ae74 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,7 +24,4 @@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) -# No loaders for 32bits yet. -ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_KEXEC) += kexec-purgatory.o -endif +obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b49a0a58102..2bee072268d9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -64,7 +64,9 @@ bool kexec_in_progress = false; char __weak kexec_purgatory[0]; size_t __weak kexec_purgatory_size = 0; +#ifdef CONFIG_KEXEC_FILE static int kexec_calculate_store_digests(struct kimage *image); +#endif /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { @@ -341,6 +343,7 @@ out_free_image: return ret; } +#ifdef CONFIG_KEXEC_FILE static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { struct fd f = fdget(fd); @@ -612,6 +615,9 @@ out_free_image: kfree(image); return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) @@ -1451,6 +1458,8 @@ out: return ret; } +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE static int __kexec_add_segment(struct kimage *image, char *buf, unsigned long bufsz, unsigned long mem, unsigned long memsz) @@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_KEXEC_FILE */ /* * Move into place and start executing a preloaded standalone -- cgit v1.2.3-71-gd317 From 62109b43176b87e78b2b6d91bcfe16128c30229b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Sep 2014 01:21:03 +0200 Subject: PM / sleep: Fix test_suspend= command line option After commit d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) the pm_states[] array is not populated initially, which causes setup_test_suspend() to always fail and the suspend testing during boot doesn't work any more. Fix the problem by using pm_labels[] instead of pm_states[] in setup_test_suspend() and storing a pointer to the label of the sleep state to test rather than the number representing it, because the connection between the state numbers and labels is only established by suspend_set_ops(). Fixes: d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) Reported-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 1 + kernel/power/suspend.c | 2 +- kernel/power/suspend_test.c | 31 +++++++++++++++++++------------ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d49dcac2537..2df883a9d3cb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ +extern const char *pm_labels[]; extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6dadb25cb0d8..18c62195660f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,7 +31,7 @@ #include "power.h" -static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 2f524928b6aa..bd91bc177c93 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - suspend_state_t i; + int i; /* "=mem" ==> "mem" */ value++; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i], value)) { - test_state = i; + for (i = 0; pm_labels[i]; i++) + if (!strcmp(pm_labels[i], value)) { + test_state_label = pm_labels[i]; return 0; } @@ -158,13 +158,21 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!pm_states[test_state]) { - printk(warn_bad_state, pm_states[test_state]); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? */ @@ -173,13 +181,12 @@ static int __init test_suspend(void) rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); -- cgit v1.2.3-71-gd317 From a4189487da1b4f8260c6006b9dc47c3c4107a5ae Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:07 +0800 Subject: cgroup: delay the clearing of cgrp->kn->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run these two scripts concurrently: for ((; ;)) { mkdir /cgroup/sub rmdir /cgroup/sub } for ((; ;)) { echo $$ > /cgroup/sub/cgroup.procs echo $$ > /cgroup/cgroup.procs } A kernel bug will be triggered: BUG: unable to handle kernel NULL pointer dereference at 00000038 IP: [] cgroup_put+0x9/0x80 ... Call Trace: [] cgroup_kn_unlock+0x39/0x50 [] cgroup_kn_lock_live+0x61/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 We clear cgrp->kn->priv in the end of cgroup_rmdir(), but another concurrent thread can access kn->priv after the clearing. We should move the clearing to css_release_work_fn(). At that time no one is holding reference to the cgroup and no one can gain a new reference to access it. v2: - move RCU_INIT_POINTER() into the else block. (Tejun) - remove the cgroup_parent() check. (Tejun) - update the comment in css_tryget_online_from_dir(). Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50b94113f4f7..bf30076664ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4396,6 +4396,15 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. + */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } mutex_unlock(&cgroup_mutex); @@ -4834,16 +4843,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgroup_kn_unlock(kn); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. - */ - if (!ret) - RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - cgroup_put(cgrp); return ret; } @@ -5430,7 +5429,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See cgroup_rmdir() for details. + * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit v1.2.3-71-gd317 From aa32362f011c6e863132b16c1761487166a4bad2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:38 +0800 Subject: cgroup: check cgroup liveliness before unbreaking kernfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup_kn_lock_live() is called through some kernfs operation and another thread is calling cgroup_rmdir(), we'll trigger the warning in cgroup_get(). ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0() ... Call Trace: [] dump_stack+0x41/0x52 [] warn_slowpath_common+0x7f/0xa0 [] warn_slowpath_null+0x1d/0x20 [] cgroup_get+0x89/0xa0 [] cgroup_kn_lock_live+0x28/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 ---[ end trace 6f2e0c38c2108a74 ]--- Fix this by calling css_tryget() instead of cgroup_get(). v2: - move cgroup_tryget() right below cgroup_get() definition. (Tejun) Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf30076664ca..940aced4ed00 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp) css_get(&cgrp->self); } +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ - cgroup_get(cgrp); + if (!cgroup_tryget(cgrp)) + return NULL; kernfs_break_active_protection(kn); mutex_lock(&cgroup_mutex); -- cgit v1.2.3-71-gd317 From 40bea039593dfc7f3f9814dab844f6db43ae580b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Aug 2014 18:50:16 +0200 Subject: nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_null+0x1a/0x20 [] irq_work_queue_on+0x11e/0x140 [] tick_nohz_full_kick_cpu+0x57/0x90 [] __perf_event_overflow+0x275/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] ? arch_vtime_task_switch+0x63/0x130 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? lock_release+0xab/0x330 [] default_do_nmi+0x72/0x1c0 [] ? cpuacct_account_field+0xcf/0x200 [] do_nmi+0xb8/0x100 Lets fix this by restoring the use of local irq work for the nohz local kick. Reported-by: Catalin Iacob Reported-and-tested-by: Dave Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 7 +------ kernel/time/tick-sched.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 059052306831..9a82c7dc3fdd 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -183,13 +183,8 @@ static inline bool tick_nohz_full_cpu(int cpu) extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); +extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); - -static inline void tick_nohz_full_kick(void) -{ - tick_nohz_full_kick_cpu(smp_processor_id()); -} - extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..f654a8a298fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -224,6 +224,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, }; +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. -- cgit v1.2.3-71-gd317 From 849151dd5481bc8acb1d287a299b5d6a4ca9f1c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:18:07 +0200 Subject: compat: nanosleep: Clarify error handling The error handling in compat_sys_nanosleep() is correct, but completely non obvious. Document it and restrict it to the -ERESTART_RESTARTBLOCK return value for clarity. Reported-by: Kees Cook Signed-off-by: Thomas Gleixner --- kernel/compat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 633394f442f8..ebb3c369d03d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); - if (ret) { + if (ret == -ERESTART_RESTARTBLOCK) { rmtp = restart->nanosleep.compat_rmtp; if (rmtp && compat_put_timespec(&rmt, rmtp)) @@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); - if (ret) { + /* + * hrtimer_nanosleep() can only return 0 or + * -ERESTART_RESTARTBLOCK here because: + * + * - we call it with HRTIMER_MODE_REL and therefor exclude the + * -ERESTARTNOHAND return path. + * + * - we supply the rmtp argument from the task stack (due to + * the necessary compat conversion. So the update cannot + * fail, which excludes the -EFAULT return path as well. If + * it fails nevertheless we have a bigger problem and wont + * reach this place anymore. + * + * - if the return value is 0, we do not have to update rmtp + * because there is no remaining time. + * + * We check for -ERESTART_RESTARTBLOCK nevertheless if the + * core implementation decides to return random nonsense. + */ + if (ret == -ERESTART_RESTARTBLOCK) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } - return ret; } -- cgit v1.2.3-71-gd317 From 9bf2419fa7bffa16ce58a4d5c20399eff8c970c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:24:49 +0200 Subject: timekeeping: Update timekeeper before updating vsyscall and pvclock The update_walltime() code works on the shadow timekeeper to make the seqcount protected region as short as possible. But that update to the shadow timekeeper does not update all timekeeper fields because it's sufficient to do that once before it becomes life. One of these fields is tkr.base_mono. That stays stale in the shadow timekeeper unless an operation happens which copies the real timekeeper to the shadow. The update function is called after the update calls to vsyscall and pvclock. While not correct, it did not cause any problems because none of the invoked update functions used base_mono. commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based) changed that in the kvm pvclock update function, so the stale mono_base value got used and caused kvm-clock to malfunction. Put the update where it belongs and fix the issue. Reported-by: Chris J Arges Reported-by: Paolo Bonzini Cc: Gleb Natapov Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409050000570.3333@nanos Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb4a9c2cf8d9..ec1791fae965 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -442,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk_update_ktime_data(tk); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); -- cgit v1.2.3-71-gd317 From 3577af70a2ce4853d58e57d832e687d739281479 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 2 Sep 2014 15:27:20 -0700 Subject: perf: Fix a race condition in perf_remove_from_context() We saw a kernel soft lockup in perf_remove_from_context(), it looks like the `perf` process, when exiting, could not go out of the retry loop. Meanwhile, the target process was forking a child. So either the target process should execute the smp function call to deactive the event (if it was running) or it should do a context switch which deactives the event. It seems we optimize out a context switch in perf_event_context_sched_out(), and what's more important, we still test an obsolete task pointer when retrying, so no one actually would deactive that event in this situation. Fix it directly by reloading the task pointer in perf_remove_from_context(). This should cure the above soft lockup. Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1409696840-843-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f9c1ed002dbc..d640a8b4dcbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1524,6 +1524,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -1967,6 +1972,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } -- cgit v1.2.3-71-gd317 From 000a7d66ec30898f46869be01ab8205b056385d0 Mon Sep 17 00:00:00 2001 From: Patrick Palka Date: Tue, 9 Sep 2014 14:50:48 -0700 Subject: kernel/printk/printk.c: fix faulty logic in the case of recursive printk We shouldn't set text_len in the code path that detects printk recursion because text_len corresponds to the length of the string inside textbuf. A few lines down from the line text_len = strlen(recursion_msg); is the line text_len += vscnprintf(text + text_len, ...); So if printk detects recursion, it sets text_len to 29 (the length of recursion_msg) and logs an error. Then the message supplied by the caller of printk is stored inside textbuf but offset by 29 bytes. This means that the output of the recursive call to printk will contain 29 bytes of garbage in front of it. This defect is caused by commit 458df9fd4815 ("printk: remove separate printk_sched buffers and use printk buf instead") which turned the line text_len = vscnprintf(text, ...); into text_len += vscnprintf(text + text_len, ...); To fix this, this patch avoids setting text_len when logging the printk recursion error. This patch also marks unlikely() the branch leading up to this code. Fixes: 458df9fd4815b478 ("printk: remove separate printk_sched buffers and use printk buf instead") Signed-off-by: Patrick Palka Reviewed-by: Petr Mladek Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e04c455a0e38..1ce770687ea8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1665,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level, raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; - if (recursion_bug) { + if (unlikely(recursion_bug)) { static const char recursion_msg[] = "BUG: recent printk recursion!"; recursion_bug = 0; - text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, text_len); + NULL, 0, recursion_msg, + strlen(recursion_msg)); } /* -- cgit v1.2.3-71-gd317 From acbbe6fbb240a927ee1f5994f04d31267d422215 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 9 Sep 2014 14:51:01 -0700 Subject: kcmp: fix standard comparison bug The C operator <= defines a perfectly fine total ordering on the set of values representable in a long. However, unlike its namesake in the integers, it is not translation invariant, meaning that we do not have "b <= c" iff "a+b <= a+c" for all a,b,c. This means that it is always wrong to try to boil down the relationship between two longs to a question about the sign of their difference, because the resulting relation [a LEQ b iff a-b <= 0] is neither anti-symmetric or transitive. The former is due to -LONG_MIN==LONG_MIN (take any two a,b with a-b = LONG_MIN; then a LEQ b and b LEQ a, but a != b). The latter can either be seen observing that x LEQ x+1 for all x, implying x LEQ x+1 LEQ x+2 ... LEQ x-1 LEQ x; or more directly with the simple example a=LONG_MIN, b=0, c=1, for which a-b < 0, b-c < 0, but a-c > 0. Note that it makes absolutely no difference that a transmogrying bijection has been applied before the comparison is done. In fact, had the obfuscation not been done, one could probably not observe the bug (assuming all values being compared always lie in one half of the address space, the mathematical value of a-b is always representable in a long). As it stands, one can easily obtain three file descriptors exhibiting the non-transitivity of kcmp(). Side note 1: I can't see that ensuring the MSB of the multiplier is set serves any purpose other than obfuscating the obfuscating code. Side note 2: #include #include #include #include #include #include #include enum kcmp_type { KCMP_FILE, KCMP_VM, KCMP_FILES, KCMP_FS, KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, KCMP_TYPES, }; pid_t pid; int kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) { return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2); } int cmp_fd(int fd1, int fd2) { int c = kcmp(pid, pid, KCMP_FILE, fd1, fd2); if (c < 0) { perror("kcmp"); exit(1); } assert(0 <= c && c < 3); return c; } int cmp_fdp(const void *a, const void *b) { static const int normalize[] = {0, -1, 1}; return normalize[cmp_fd(*(int*)a, *(int*)b)]; } #define MAX 100 /* This is plenty; I've seen it trigger for MAX==3 */ int main(int argc, char *argv[]) { int r, s, count = 0; int REL[3] = {0,0,0}; int fd[MAX]; pid = getpid(); while (count < MAX) { r = open("/dev/null", O_RDONLY); if (r < 0) break; fd[count++] = r; } printf("opened %d file descriptors\n", count); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", REL[0], REL[1], REL[2]); qsort(fd, count, sizeof(fd[0]), cmp_fdp); memset(REL, 0, sizeof(REL)); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", REL[0], REL[1], REL[2]); return (REL[0] + REL[2] != 0); } Signed-off-by: Rasmus Villemoes Reviewed-by: Cyrill Gorcunov "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0fe61c3..0aa69ea1d8fd 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { - long ret; + long t1, t2; - ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + t1 = kptr_obfuscate((long)v1, type); + t2 = kptr_obfuscate((long)v2, type); - return (ret < 0) | ((ret > 0) << 1); + return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ -- cgit v1.2.3-71-gd317 From 13c42c2f43b19aab3195f2d357db00d1e885eaa8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Sep 2014 23:44:35 +0200 Subject: futex: Unlock hb->lock in futex_wait_requeue_pi() error path futex_wait_requeue_pi() calls futex_wait_setup(). If futex_wait_setup() succeeds it returns with hb->lock held and preemption disabled. Now the sanity check after this does: if (match_futex(&q.key, &key2)) { ret = -EINVAL; goto out_put_keys; } which releases the keys but does not release hb->lock. So we happily return to user space with hb->lock held and therefor preemption disabled. Unlock hb->lock before taking the exit route. Reported-by: Dave "Trinity" Jones Signed-off-by: Thomas Gleixner Reviewed-by: Darren Hart Reviewed-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409112318500.4178@nanos Signed-off-by: Thomas Gleixner --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d3a9d946d0b7..815d7af2ffe8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2592,6 +2592,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * shared futexes. We need to compare the keys: */ if (match_futex(&q.key, &key2)) { + queue_unlock(hb); ret = -EINVAL; goto out_put_keys; } -- cgit v1.2.3-71-gd317 From d78c9300c51d6ceed9f6d078d4e9366f259de28c Mon Sep 17 00:00:00 2001 From: Andrew Hunter Date: Thu, 4 Sep 2014 14:17:16 -0700 Subject: jiffies: Fix timeval conversion to jiffies timeval_to_jiffies tried to round a timeval up to an integral number of jiffies, but the logic for doing so was incorrect: intervals corresponding to exactly N jiffies would become N+1. This manifested itself particularly repeatedly stopping/starting an itimer: setitimer(ITIMER_PROF, &val, NULL); setitimer(ITIMER_PROF, NULL, &val); would add a full tick to val, _even if it was exactly representable in terms of jiffies_ (say, the result of a previous rounding.) Doing this repeatedly would cause unbounded growth in val. So fix the math. Here's what was wrong with the conversion: we essentially computed (eliding seconds) jiffies = usec * (NSEC_PER_USEC/TICK_NSEC) by using scaling arithmetic, which took the best approximation of NSEC_PER_USEC/TICK_NSEC with denominator of 2^USEC_JIFFIE_SC = x/(2^USEC_JIFFIE_SC), and computed: jiffies = (usec * x) >> USEC_JIFFIE_SC and rounded this calculation up in the intermediate form (since we can't necessarily exactly represent TICK_NSEC in usec.) But the scaling arithmetic is a (very slight) *over*approximation of the true value; that is, instead of dividing by (1 usec/ 1 jiffie), we effectively divided by (1 usec/1 jiffie)-epsilon (rounding down). This would normally be fine, but we want to round timeouts up, and we did so by adding 2^USEC_JIFFIE_SC - 1 before the shift; this would be fine if our division was exact, but dividing this by the slightly smaller factor was equivalent to adding just _over_ 1 to the final result (instead of just _under_ 1, as desired.) In particular, with HZ=1000, we consistently computed that 10000 usec was 11 jiffies; the same was true for any exact multiple of TICK_NSEC. We could possibly still round in the intermediate form, adding something less than 2^USEC_JIFFIE_SC - 1, but easier still is to convert usec->nsec, round in nanoseconds, and then convert using time*spec*_to_jiffies. This adds one constant multiplication, and is not observably slower in microbenchmarks on recent x86 hardware. Tested: the following program: int main() { struct itimerval zero = {{0, 0}, {0, 0}}; /* Initially set to 10 ms. */ struct itimerval initial = zero; initial.it_interval.tv_usec = 10000; setitimer(ITIMER_PROF, &initial, NULL); /* Save and restore several times. */ for (size_t i = 0; i < 10; ++i) { struct itimerval prev; setitimer(ITIMER_PROF, &zero, &prev); /* on old kernels, this goes up by TICK_USEC every iteration */ printf("previous value: %ld %ld %ld %ld\n", prev.it_interval.tv_sec, prev.it_interval.tv_usec, prev.it_value.tv_sec, prev.it_value.tv_usec); setitimer(ITIMER_PROF, &prev, NULL); } return 0; } Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Turner Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Paul Turner Reported-by: Aaron Jacobs Signed-off-by: Andrew Hunter [jstultz: Tweaked to apply to 3.17-rc] Signed-off-by: John Stultz --- include/linux/jiffies.h | 12 ----------- kernel/time/time.c | 56 +++++++++++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 1f44466c1e9d..c367cbdf73ab 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -258,23 +258,11 @@ extern unsigned long preset_lpj; #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) -#define USEC_CONVERSION \ - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\ - TICK_NSEC -1) / (u64)TICK_NSEC)) -/* - * USEC_ROUND is used in the timeval to jiffie conversion. See there - * for more details. It is the scaled resolution rounding value. Note - * that it is a 64-bit value. Since, when it is applied, we are already - * in jiffies (albit scaled), it is nothing but the bits we will shift - * off. - */ -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1) /* * The maximum jiffie value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, diff --git a/kernel/time/time.c b/kernel/time/time.c index f0294ba14634..a9ae20fb0b11 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); -- cgit v1.2.3-71-gd317 From e86fea764991e00a03ff1e56409ec9cacdbda4c9 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:03 -0700 Subject: alarmtimer: Return relative times in timer_gettime Returns the time remaining for an alarm timer, rather than the time at which it is scheduled to expire. If the timer has already expired or it is not currently scheduled, the it_value's members are set to zero. This new behavior matches that of the other posix-timers and the POSIX specifications. This is a change in user-visible behavior, and may break existing applications. Hopefully, few users rely on the old incorrect behavior. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque [jstultz: minor style tweak] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4aec4a457431..b4bce62e47b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,18 +541,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @new_timer: k_itimer pointer * @cur_setting: itimerspec data to fill * - * Copies the itimerspec data out from the k_itimer + * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - memset(cur_setting, 0, sizeof(struct itimerspec)); + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } + + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); } /** -- cgit v1.2.3-71-gd317 From 265b81d23a46c39df0a735a3af4238954b41a4c2 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:04 -0700 Subject: alarmtimer: Do not signal SIGEV_NONE timers Avoids sending a signal to alarm timers created with sigev_notify set to SIGEV_NONE by checking for that special case in the timeout callback. The regular posix timers avoid sending signals to SIGEV_NONE timers by not scheduling any callbacks for them in the first place. Although it would be possible to do something similar for alarm timers, it's simpler to handle this as a special case in the timeout. Prior to this patch, the alarm timer would ignore the sigev_notify value and try to deliver signals to the process anyway. Even worse, the sanity check for the value of sigev_signo is skipped when SIGEV_NONE was specified, so the signal number could be bogus. If sigev_signo was an unitialized value (as it often would be if SIGEV_NONE is used), then it's hard to predict which signal will be sent. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b4bce62e47b2..41a925396830 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -466,8 +466,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } /* Re-add periodic timers */ if (ptr->it.alarm.interval.tv64) { -- cgit v1.2.3-71-gd317 From 474e941bed9262f5fa2394f9a4a67e24499e5926 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:05 -0700 Subject: alarmtimer: Lock k_itimer during timer callback Locks the k_itimer's it_lock member when handling the alarm timer's expiry callback. The regular posix timers defined in posix-timers.c have this lock held during timout processing because their callbacks are routed through posix_timer_fn(). The alarm timers follow a different path, so they ought to grab the lock somewhere else. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 41a925396830..a7077d3ae52f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -464,8 +464,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { + unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; @@ -475,9 +479,11 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (ptr->it.alarm.interval.tv64) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); - return ALARMTIMER_RESTART; + result = ALARMTIMER_RESTART; } - return ALARMTIMER_NORESTART; + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; } /** -- cgit v1.2.3-71-gd317 From eb4aec84d6bdf98d00cedb41c18000f7a31e648a Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 18 Sep 2014 17:28:46 +0800 Subject: cgroup: fix unbalanced locking cgroup_pidlist_start() holds cgrp->pidlist_mutex and then calls pidlist_array_load(), and cgroup_pidlist_stop() releases the mutex. It is wrong that we release the mutex in the failure path in pidlist_array_load(), because cgroup_pidlist_stop() will be called no matter if cgroup_pidlist_start() returns errno or not. Fixes: 4bac00d16a8760eae7205e41d2c246477d42a210 Cc: # 3.14+ Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Acked-by: Cong Wang --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 940aced4ed00..3a73f995a81e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3985,7 +3985,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l = cgroup_pidlist_find_create(cgrp, type); if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } -- cgit v1.2.3-71-gd317