From b6070a8d9853eda010a549fa9a09eb8d7269b929 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:29 -0700 Subject: futex: Test for pi_mutex on fault in futex_wait_requeue_pi() If fixup_pi_state_owner() faults, pi_mutex may be NULL. Test for pi_mutex != NULL before testing the owner against current and possibly unlocking it. Signed-off-by: Darren Hart Cc: Dave Jones Cc: Dan Carpenter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/dc59890338fc413606f04e5c5b131530734dae3d.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e2b0fb9a0b3b..05018bfe21a7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2370,7 +2370,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. */ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* -- cgit v1.2.3-71-gd317 From f27071cb7fe3e1d37a9dbe6c0dfc5395cd40fa43 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:30 -0700 Subject: futex: Fix bug in WARN_ON for NULL q.pi_state The WARN_ON in futex_wait_requeue_pi() for a NULL q.pi_state was testing the address (&q.pi_state) of the pointer instead of the value (q.pi_state) of the pointer. Correct it accordingly. Signed-off-by: Darren Hart Cc: Dave Jones Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1c85d97f6e5f79ec389a4ead3e367363c74bd09a.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 05018bfe21a7..5551adaf7cdf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2343,7 +2343,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); -- cgit v1.2.3-71-gd317 From 6f7b0a2a5c0fb03be7c25bd1745baa50582348ef Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:31 -0700 Subject: futex: Forbid uaddr == uaddr2 in futex_wait_requeue_pi() If uaddr == uaddr2, then we have broken the rule of only requeueing from a non-pi futex to a pi futex with this call. If we attempt this, as the trinity test suite manages to do, we miss early wakeups as q.key is equal to key2 (because they are the same uaddr). We will then attempt to dereference the pi_mutex (which would exist had the futex_q been properly requeued to a pi futex) and trigger a NULL pointer dereference. Signed-off-by: Darren Hart Cc: Dave Jones Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/ad82bfe7f7d130247fbe2b5b4275654807774227.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5551adaf7cdf..3717e7b306e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; -- cgit v1.2.3-71-gd317 From b44d50dcacea0d485ca2ff9140f8cc28ee22f28d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 23 Jul 2012 16:22:37 -0400 Subject: time: Fix casting issue in tk_set_xtime and tk_xtime_add commit 1e75fa8b (time: Condense timekeeper.xtime into xtime_sec) introduced helper functions which apply a timespec to the core internal timekeeper data. The internal storage type is u64. The timespec tv_nsec value must be shifted before set or added to the internal value. tv_nsec is a long, which is 32bit on a 32bit system, so without casting tv_nsec to u64 we lose the bits which are shifted over the 32bit boundary. Add the proper typecasts. Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Signed-off-by: John Stultz Acked-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1343074957-16541-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5980e902978c..8f2aba1246f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -108,13 +108,13 @@ static struct timespec tk_xtime(struct timekeeper *tk) static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = ts->tv_nsec << tk->shift; + tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += ts->tv_nsec << tk->shift; + tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; } /** -- cgit v1.2.3-71-gd317 From dc9b229a58dc0dfed34272ff26c6d5fd17c674e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2012 19:29:45 +0200 Subject: genirq: Allow irq chips to mark themself oneshot safe Some interrupt chips like MSI are oneshot safe by implementation. For those interrupts we can avoid the mask/unmask sequence for threaded interrupt handlers. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1207132056540.32033@ionos Cc: Linus Torvalds Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Jan Kiszka --- include/linux/irq.h | 1 + kernel/irq/manage.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 553fb66da130..216b0ba109d7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -349,6 +349,7 @@ enum { IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), + IRQCHIP_ONESHOT_SAFE = (1 << 5), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..2e326d1ebec1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -959,6 +959,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_thread; } + /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + /* * The following block of code has to be executed atomically */ @@ -1033,7 +1045,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it -- cgit v1.2.3-71-gd317 From 45afb1734fa6323a8ba08bd6c392ee227df67dde Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 7 Jul 2012 16:49:02 +0900 Subject: sched: Use task_rq_unlock() in __sched_setscheduler() It seems there's no specific reason to open-code it. I guess commit 0122ec5b02f76 ("sched: Add p->pi_lock to task_rq_lock()") simply missed it. Let's be consistent with others. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341647342-6742-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d011ef4c0df..2cb4e7777998 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4340,9 +4340,7 @@ recheck: */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } -- cgit v1.2.3-71-gd317 From 014acbf0d5c8445e0ff88ae60edd676dd9cc461c Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 12 Jul 2012 15:03:42 +0800 Subject: sched: Fix minor code style issues Delete redudant spaces between type name and data name or operators. Signed-off-by: Ying Xue Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342076622-6606-1-git-send-email-ying.xue0@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586fdf660..23aa789c53ee 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -65,8 +65,8 @@ static int convert_prio(int prio) int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) { - int idx = 0; - int task_pri = convert_prio(p->prio); + int idx = 0; + int task_pri = convert_prio(p->prio); if (task_pri >= MAX_RT_PRIO) return 0; @@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, */ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; newpri = convert_prio(newpri); -- cgit v1.2.3-71-gd317 From 0f26d0e0a715556270d85b7946b99546a2f92888 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 30 Jul 2012 22:44:41 -0500 Subject: kdb: Remove unused KDB_FLAG_ONLY_DO_DUMP This code cleanup was missed in the original kdb merge, and this code is simply not used at all. The code that was previously used to set the KDB_FLAG_ONLY_DO_DUMP was removed prior to the initial kdb merge. Signed-off-by: Jason Wessel --- include/linux/kdb.h | 2 -- kernel/debug/kdb/kdb_main.c | 15 +-------------- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 064725854db8..42d9e863a313 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -75,8 +75,6 @@ extern const char *kdb_diemsg; #define KDB_FLAG_CATASTROPHIC (1 << 1) /* A catastrophic event has occurred */ #define KDB_FLAG_CMD_INTERRUPT (1 << 2) /* Previous command was interrupted */ #define KDB_FLAG_NOIPI (1 << 3) /* Do not send IPIs */ -#define KDB_FLAG_ONLY_DO_DUMP (1 << 4) /* Only do a dump, used when - * kdb is off */ #define KDB_FLAG_NO_CONSOLE (1 << 5) /* No console is available, * kdb is disabled */ #define KDB_FLAG_NO_VT_CONSOLE (1 << 6) /* No VT console is available, do diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1f91413edb87..31df1706b9a9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), -- cgit v1.2.3-71-gd317 From 07cd27bbd4d07af6c3e24ae479316a69e7935e1e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 30 Jul 2012 22:44:41 -0500 Subject: kdb: Remove cpu from the more prompt Having the CPU in the more prompt is completely redundent vs the standard kdb prompt, and it also wastes 32 bytes on the stack. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bb9520f0f6ff..0a69d2adc4f3 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -715,9 +715,6 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline == linecount) { char buf1[16] = ""; -#if defined(CONFIG_SMP) - char buf2[32]; -#endif /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -732,14 +729,6 @@ kdb_printit: if (moreprompt == NULL) moreprompt = "more> "; -#if defined(CONFIG_SMP) - if (strchr(moreprompt, '%')) { - sprintf(buf2, moreprompt, get_cpu()); - put_cpu(); - moreprompt = buf2; - } -#endif - kdb_input_flush(); c = console_drivers; -- cgit v1.2.3-71-gd317 From b10d22d6e8f76b9e94871aebe0fc62aab2748200 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 30 Jul 2012 04:58:10 -0700 Subject: kernel/debug: Make use of KGDB_REASON_NMI Currently kernel never set KGDB_REASON_NMI. We do now, when we enter KGDB/KDB from an NMI. This is not to be confused with kgdb_nmicallback(), NMI callback is an entry for the slave CPUs during CPUs roundup, but REASON_NMI is the entry for the master CPU. Signed-off-by: Anton Vorontsov Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_debugger.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8b68ce78ff17..be7b33b73d30 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "kdb_private.h" #include "../debug_core.h" @@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; + if (in_nmi()) + reason = KDB_REASON_NMI; + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { if ((bp->bp_enabled) && (bp->bp_addr == addr)) { reason = KDB_REASON_BREAK; -- cgit v1.2.3-71-gd317 From b9403130a5350fca59a50ed11c198cb8c7e54119 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 12 Jul 2012 16:10:13 +0800 Subject: sched/cleanups: Add load balance cpumask pointer to 'struct lb_env' With this patch struct ld_env will have a pointer of the load balancing cpumask and we don't need to pass a cpumask around anymore. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FFE8665.3080705@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22321db64952..d0cc03b3e70b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3069,6 +3069,9 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + unsigned int flags; unsigned int loop; @@ -3653,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) + int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; @@ -3671,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, max_nr_running = 0; min_nr_running = ~0UL; - for_each_cpu_and(i, sched_group_cpus(group), cpus) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; @@ -3800,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -3818,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, - cpus, balance, &sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); if (local_group && !(*balance)) return; @@ -4055,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. * @@ -4065,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, int *balance) { struct sd_lb_stats sds; @@ -4075,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, cpus, balance, &sds); + update_sd_lb_stats(env, balance, &sds); /* * this_cpu is not the appropriate cpu to perform load balancing at @@ -4155,8 +4154,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq *find_busiest_queue(struct lb_env *env, - struct sched_group *group, - const struct cpumask *cpus) + struct sched_group *group) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -4171,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, cpus)) + if (!cpumask_test_cpu(i, env->cpus)) continue; rq = cpu_rq(i); @@ -4252,6 +4250,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, + .cpus = cpus, }; cpumask_copy(cpus, cpu_active_mask); @@ -4260,7 +4259,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, cpus, balance); + group = find_busiest_group(&env, balance); if (*balance == 0) goto out_balanced; @@ -4270,7 +4269,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group, cpus); + busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; -- cgit v1.2.3-71-gd317 From e6dab5ffab59e910ec0e3355f4a6f29f7a7be474 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Wed, 11 Jul 2012 18:14:58 +0400 Subject: perf/trace: Add ability to set a target task for events A few events are interesting not only for a current task. For example, sched_stat_* events are interesting for a task which wakes up. For this reason, it will be good if such events will be delivered to a target task too. Now a target task can be set by using __perf_task(). The original idea and a draft patch belongs to Peter Zijlstra. I need these events for profiling sleep times. sched_switch is used for getting callchains and sched_stat_* is used for getting time periods. These events are combined in user space, then it can be analyzed by perf tools. Inspired-by: Peter Zijlstra Cc: Steven Rostedt Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Arun Sharma Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342016098-213063-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 5 +++-- include/linux/perf_event.h | 3 ++- include/trace/events/sched.h | 4 ++++ include/trace/ftrace.h | 6 +++++- kernel/events/callchain.c | 9 ++++++++- kernel/events/core.c | 30 ++++++++++++++++++++++++++++-- kernel/events/internal.h | 3 ++- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_kprobe.c | 6 ++++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 11 files changed, 60 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index af961d6f7ab1..642928cf57b4 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -306,9 +306,10 @@ extern void *perf_trace_buf_prepare(int size, unsigned short type, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, - u64 count, struct pt_regs *regs, void *head) + u64 count, struct pt_regs *regs, void *head, + struct task_struct *task) { - perf_tp_event(addr, count, raw_data, size, regs, head, rctx); + perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 76c5c8b724a7..7602ccb3f40e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1272,7 +1272,8 @@ static inline bool perf_paranoid_kernel(void) extern void perf_event_init(void); extern void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct pt_regs *regs, - struct hlist_head *head, int rctx); + struct hlist_head *head, int rctx, + struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ea7a2035456d..5a8671e8a67f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -73,6 +73,9 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, __entry->prio = p->prio; __entry->success = success; __entry->target_cpu = task_cpu(p); + ) + TP_perf_assign( + __perf_task(p); ), TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d", @@ -325,6 +328,7 @@ DECLARE_EVENT_CLASS(sched_stat_template, ) TP_perf_assign( __perf_count(delay); + __perf_task(tsk); ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6bc2faaf261..a763888a36f9 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -712,6 +712,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __perf_count #define __perf_count(c) __count = (c) +#undef __perf_task +#define __perf_task(t) __task = (t) + #undef TP_perf_assign #define TP_perf_assign(args...) args @@ -725,6 +728,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_raw_##call *entry; \ struct pt_regs __regs; \ u64 __addr = 0, __count = 1; \ + struct task_struct *__task = NULL; \ struct hlist_head *head; \ int __entry_size; \ int __data_size; \ @@ -752,7 +756,7 @@ perf_trace_##call(void *__data, proto) \ \ head = this_cpu_ptr(event_call->perf_events); \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head); \ + __count, &__regs, head, __task); \ } /* diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a040f399..98d4597f43d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f1cf0edeb39a..b7935fcec7d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90afc..a096c19f2c2a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..8a6d2ee2086c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699fe..1a2117043bb1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc73369099..60e4d7875672 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549e..03003cd7dd96 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); -- cgit v1.2.3-71-gd317 From 02ab20ae38337b99b5c29c81090f594b8fd61283 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:10 -0400 Subject: time/jiffies: Rename ACTHZ to SHIFTED_HZ Ingo noted that ACTHZ is a confusing name, and requested it be renamed, so this patch renames ACTHZ to SHIFTED_HZ to better describe it. Signed-off-by: John Stultz Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 21 +++++++++++++-------- include/linux/timex.h | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/ntp.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 7d24466fb80b..82680541576d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -55,21 +55,26 @@ /* LATCH is used in the interval timer and ftape setup. */ # define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ -/* HZ is the requested value. ACTHZ is actual HZ ("<< 8" is for accuracy) */ -# define ACTHZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) +/* + * HZ is the requested value. However the CLOCK_TICK_RATE may not allow + * for exactly HZ. So SHIFTED_HZ is high res HZ ("<< 8" is for accuracy) + */ +# define SHIFTED_HZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) #else -# define ACTHZ (HZ << 8) +# define SHIFTED_HZ (HZ << 8) #endif -/* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ -#define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) +/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +#define TICK_NSEC (SH_DIV(1000000UL * 1000, SHIFTED_HZ, 8)) /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* TICK_USEC_TO_NSEC is the time between ticks in nsec assuming real ACTHZ and */ -/* a value TUSEC for TICK_USEC (can be set bij adjtimex) */ -#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV (TUSEC * USER_HZ * 1000, ACTHZ, 8)) +/* + * TICK_USEC_TO_NSEC is the time between ticks in nsec assuming SHIFTED_HZ and + * a value TUSEC for TICK_USEC (can be set bij adjtimex) + */ +#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV(TUSEC * USER_HZ * 1000, SHIFTED_HZ, 8)) /* some arch's have a small-data section that can be accessed register-relative * but that can only take up to, say, 4-byte variables. jiffies being part of diff --git a/include/linux/timex.h b/include/linux/timex.h index 99bc88b1fc02..7c5ceb20e03a 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -232,7 +232,7 @@ struct timex { * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ -extern unsigned long tick_nsec; /* ACTHZ period (nsec) */ +extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ extern void ntp_init(void); extern void ntp_clear(void); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e0408..46da0537c10b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c973..24174b4d669b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; -- cgit v1.2.3-71-gd317 From d4e3ab384b2343c7074f713ac330f839c38c52ee Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:11 -0400 Subject: time: Clean up stray newlines Ingo noted inconsistent newline usage between functions. This patch cleans those up. Signed-off-by: John Stultz Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cf364db5589f..05b37a5ec7fb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -239,7 +239,6 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } - /** * timekeeping_forward_now - update clock to the current time * @@ -436,7 +435,6 @@ int do_settimeofday(const struct timespec *tv) } EXPORT_SYMBOL(do_settimeofday); - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset @@ -550,7 +548,6 @@ void getrawmonotonic(struct timespec *ts) } EXPORT_SYMBOL(getrawmonotonic); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@ -683,7 +680,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); } - /** * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec delta value @@ -718,7 +714,6 @@ void timekeeping_inject_sleeptime(struct timespec *delta) clock_was_set(); } - /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @@ -1003,7 +998,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } - /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * @@ -1032,7 +1026,6 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) } } - /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -1076,7 +1069,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1177,7 +1169,6 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); - /** * get_monotonic_boottime - Returns monotonic time since boot * @ts: pointer to the timespec to be set @@ -1358,7 +1349,6 @@ ktime_t ktime_get_monotonic_offset(void) } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. -- cgit v1.2.3-71-gd317 From 6d0ef903e2bda70da124c10d8ad89f2382c87991 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:12 -0400 Subject: time: Clean up offs_real/wall_to_mono and offs_boot/total_sleep_time updates For performance reasons, we maintain ktime_t based duplicates of wall_to_monotonic (offs_real) and total_sleep_time (offs_boot). Since large problems could occur (such as the resume regression on 3.5-rc7, or the leapsecond hrtimer issue) if these value pairs were to be inconsistently updated, this patch this cleans up how we modify these value pairs to ensure we are always consistent. As a side-effect this is also more efficient as we only caulculate the duplicate values when they are changed, rather then every update_wall_time call. This also provides WARN_ONs to detect if future changes break the invariants. Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-5-git-send-email-john.stultz@linaro.org [ Cleaned up minor style issues. ] Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 90 ++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 05b37a5ec7fb..4da65592b4d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -65,14 +65,14 @@ struct timekeeper { * used instead. */ struct timespec wall_to_monotonic; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ ktime_t offs_real; + /* time spent in suspend */ + struct timespec total_sleep_time; /* Offset clock monotonic -> clock boottime */ ktime_t offs_boot; + /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ + struct timespec raw_time; /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -117,6 +117,31 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; } +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ + struct timespec tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec_to_ktime(tmp); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ + /* Verify consistency before modifying */ + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + + tk->total_sleep_time = t; + tk->offs_boot = timespec_to_ktime(t); +} + /** * timekeeper_setup_internals - Set up internals to use clocksource clock. * @@ -217,14 +242,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } -static void update_rt_offset(struct timekeeper *tk) -{ - struct timespec tmp, *wtm = &tk->wall_to_monotonic; - - set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); -} - /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -234,7 +251,6 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(tk); xt = tk_xtime(tk); update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } @@ -419,8 +435,8 @@ int do_settimeofday(const struct timespec *tv) ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, ts_delta); + tk_set_wall_to_mono(&timekeeper, + timespec_sub(timekeeper.wall_to_monotonic, ts_delta)); tk_set_xtime(&timekeeper, tv); @@ -454,8 +470,8 @@ int timekeeping_inject_offset(struct timespec *ts) tk_xtime_add(&timekeeper, ts); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *ts); + tk_set_wall_to_mono(&timekeeper, + timespec_sub(timekeeper.wall_to_monotonic, *ts)); timekeeping_update(&timekeeper, true); @@ -621,7 +637,7 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - struct timespec now, boot; + struct timespec now, boot, tmp; read_persistent_clock(&now); read_boot_clock(&boot); @@ -642,23 +658,19 @@ void __init timekeeping_init(void) if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(&timekeeper); - set_normalized_timespec(&timekeeper.wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(&timekeeper); - timekeeper.total_sleep_time.tv_sec = 0; - timekeeper.total_sleep_time.tv_nsec = 0; + set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(&timekeeper, tmp); + + tmp.tv_sec = 0; + tmp.tv_nsec = 0; + tk_set_sleep_time(&timekeeper, tmp); + write_sequnlock_irqrestore(&timekeeper.lock, flags); } /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; -static void update_sleep_time(struct timespec t) -{ - timekeeper.total_sleep_time = t; - timekeeper.offs_boot = timespec_to_ktime(t); -} - /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -674,10 +686,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, "sleep delta value!\n"); return; } - tk_xtime_add(tk, delta); - tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); - update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); } /** @@ -1018,11 +1029,18 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); - tk->xtime_sec += leap; - tk->wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); + if (unlikely(leap)) { + struct timespec ts; + + tk->xtime_sec += leap; + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec_sub(tk->wall_to_monotonic, ts)); + + clock_was_set_delayed(); + } } } -- cgit v1.2.3-71-gd317 From 4e250fdde9be50581c7dd5fed88c9b9960615314 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:13 -0400 Subject: time: Remove all direct references to timekeeper Ingo noted that the numerous timekeeper.value references made the timekeeping code ugly and caused many long lines that had to be broken up. He recommended replacing timekeeper.value references with tk->value. This patch provides a local tk value for all top level time functions and sets it to &timekeeper. Then all timekeeper access is done via a tk pointer. Signed-off-by: John Stultz Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 282 +++++++++++++++++++++++++--------------------- 1 file changed, 154 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4da65592b4d9..2988bc819187 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -292,18 +292,19 @@ static void timekeeping_forward_now(struct timekeeper *tk) */ void getnstimeofday(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } @@ -311,19 +312,18 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { + struct timekeeper *tk = &timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime_sec + - timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(&timekeeper) + - timekeeper.wall_to_monotonic.tv_nsec; + seq = read_seqbegin(&tk->lock); + secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -342,18 +342,19 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec); @@ -373,22 +374,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - *ts_raw = timekeeper.raw_time; - ts_real->tv_sec = timekeeper.xtime_sec; + *ts_raw = tk->raw_time; + ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(&timekeeper); - nsecs_real = timekeeping_get_ns(&timekeeper); + nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -421,28 +423,28 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { + struct timekeeper *tk = &timekeeper; struct timespec ts_delta, xt; unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - xt = tk_xtime(&timekeeper); + xt = tk_xtime(tk); ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - tk_set_wall_to_mono(&timekeeper, - timespec_sub(timekeeper.wall_to_monotonic, ts_delta)); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(&timekeeper, tv); + tk_set_xtime(tk, tv); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -459,23 +461,23 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long flags; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - tk_xtime_add(&timekeeper, ts); - tk_set_wall_to_mono(&timekeeper, - timespec_sub(timekeeper.wall_to_monotonic, *ts)); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -491,23 +493,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ static int change_clocksource(void *data) { + struct timekeeper *tk = &timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - tk_setup_internals(&timekeeper, new); + old = tk->clock; + tk_setup_internals(tk, new); if (old->disable) old->disable(old); } - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); return 0; } @@ -521,7 +524,9 @@ static int change_clocksource(void *data) */ void timekeeping_notify(struct clocksource *clock) { - if (timekeeper.clock == clock) + struct timekeeper *tk = &timekeeper; + + if (tk->clock == clock) return; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); @@ -550,15 +555,16 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs; do { - seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(&timekeeper); - *ts = timekeeper.raw_time; + seq = read_seqbegin(&tk->lock); + nsecs = timekeeping_get_ns_raw(tk); + *ts = tk->raw_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } @@ -569,15 +575,16 @@ EXPORT_SYMBOL(getrawmonotonic); */ int timekeeping_valid_for_hres(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; int ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -587,15 +594,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->max_idle_ns; + ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -635,6 +643,7 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { + struct timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; struct timespec now, boot, tmp; @@ -642,30 +651,30 @@ void __init timekeeping_init(void) read_persistent_clock(&now); read_boot_clock(&boot); - seqlock_init(&timekeeper.lock); + seqlock_init(&tk->lock); ntp_init(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(&timekeeper, clock); + tk_setup_internals(tk, clock); - tk_set_xtime(&timekeeper, &now); - timekeeper.raw_time.tv_sec = 0; - timekeeper.raw_time.tv_nsec = 0; + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(&timekeeper); + boot = tk_xtime(tk); set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(&timekeeper, tmp); + tk_set_wall_to_mono(tk, tmp); tmp.tv_sec = 0; tmp.tv_nsec = 0; - tk_set_sleep_time(&timekeeper, tmp); + tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); } /* time in seconds when suspend began */ @@ -703,6 +712,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -711,15 +721,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(&timekeeper, delta); + __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -734,6 +744,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -741,18 +752,18 @@ static void timekeeping_resume(void) clocksource_resume(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&timekeeper, &ts); + __timekeeping_inject_sleeptime(tk, &ts); } /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; + tk->clock->cycle_last = tk->clock->read(tk->clock); + tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(&timekeeper, false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, false); + write_sequnlock_irqrestore(&tk->lock, flags); touch_softlockup_watchdog(); @@ -764,14 +775,15 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(&timekeeper); + write_seqlock_irqsave(&tk->lock, flags); + timekeeping_forward_now(tk); timekeeping_suspended = 1; /* @@ -780,7 +792,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -793,7 +805,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -904,7 +916,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * the error. This causes the likely below to be unlikely. * * The proper fix is to avoid rounding up by using - * the high precision timekeeper.xtime_nsec instead of + * the high precision tk->xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. */ @@ -1094,21 +1106,22 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, static void update_wall_time(void) { struct clocksource *clock; + struct timekeeper *tk = &timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; s64 remainder; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = timekeeper.clock; + clock = tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = timekeeper.cycle_interval; + offset = tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif @@ -1121,19 +1134,19 @@ static void update_wall_time(void) * chunk in one go, and then try to consume the next smaller * doubled multiple. */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(&timekeeper, offset, shift); - if(offset < timekeeper.cycle_interval<= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift); + if (offset < tk->cycle_interval<xtime_nsec & ((1 << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1 << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(&timekeeper); + accumulate_nsecs_to_secs(tk); - timekeeping_update(&timekeeper, false); + timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); } @@ -1176,11 +1189,12 @@ out: */ void getboottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec boottime = { - .tv_sec = timekeeper.wall_to_monotonic.tv_sec + - timekeeper.total_sleep_time.tv_sec, - .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + - timekeeper.total_sleep_time.tv_nsec + .tv_sec = tk->wall_to_monotonic.tv_sec + + tk->total_sleep_time.tv_sec, + .tv_nsec = tk->wall_to_monotonic.tv_nsec + + tk->total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); @@ -1198,19 +1212,20 @@ EXPORT_SYMBOL_GPL(getboottime); */ void get_monotonic_boottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; - sleep = timekeeper.total_sleep_time; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; + sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); @@ -1240,31 +1255,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, timekeeper.total_sleep_time); + struct timekeeper *tk = &timekeeper; + + *ts = timespec_add(*ts, tk->total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime_sec; + struct timekeeper *tk = &timekeeper; + + return tk->xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return tk_xtime(&timekeeper); + struct timekeeper *tk = &timekeeper; + + return tk_xtime(tk); } struct timespec current_kernel_time(void) { + struct timekeeper *tk = &timekeeper; struct timespec now; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + } while (read_seqretry(&tk->lock, seq)); return now; } @@ -1272,15 +1294,16 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { + struct timekeeper *tk = &timekeeper; struct timespec now, mono; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - mono = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1309,14 +1332,15 @@ void do_timer(unsigned long ticks) void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, struct timespec *wtom, struct timespec *sleep) { + struct timekeeper *tk = &timekeeper; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); - *xtim = tk_xtime(&timekeeper); - *wtom = timekeeper.wall_to_monotonic; - *sleep = timekeeper.total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + *xtim = tk_xtime(tk); + *wtom = tk->wall_to_monotonic; + *sleep = tk->total_sleep_time; + } while (read_seqretry(&tk->lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1330,19 +1354,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, */ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) { + struct timekeeper *tk = &timekeeper; ktime_t now; unsigned int seq; u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - secs = timekeeper.xtime_sec; - nsecs = timekeeping_get_ns(&timekeeper); + secs = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); - *offs_real = timekeeper.offs_real; - *offs_boot = timekeeper.offs_boot; - } while (read_seqretry(&timekeeper.lock, seq)); + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1355,13 +1380,14 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) */ ktime_t ktime_get_monotonic_offset(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; struct timespec wtom; do { - seq = read_seqbegin(&timekeeper.lock); - wtom = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + wtom = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); return timespec_to_ktime(wtom); } -- cgit v1.2.3-71-gd317 From 1d17d17484d40f2d5b35c79518597a2b25296996 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 4 Aug 2012 21:21:14 +0200 Subject: time: Fix adjustment cleanup bug in timekeeping_adjust() Tetsuo Handa reported that sporadically the system clock starts counting up too quickly which is enough to confuse the hangcheck timer to print a bogus stall warning. Commit 2a8c0883 "time: Move xtime_nsec adjustment underflow handling timekeeping_adjust" overlooked this exit path: } else return; which should really be a proper exit sequence, fixing the bug as a side effect. Also make the flow more readable by properly balancing curly braces. Reported-by: Tetsuo Handa wrote: Tested-by: Tetsuo Handa wrote: Signed-off-by: Ingo Molnar Cc: john.stultz@linaro.org Cc: a.p.zijlstra@chello.nl Cc: richardcochran@gmail.com Cc: prarit@redhat.com Link: http://lkml.kernel.org/r/20120804192114.GA28347@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2988bc819187..e16af197a2bc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -923,20 +923,22 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else - return; + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } else { + if (error < -interval) { + /* See comment above, this is just switched for the negative */ + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else { + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } + } else { + goto out_adjust; + } + } if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { @@ -999,6 +1001,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource -- cgit v1.2.3-71-gd317 From 300d3739e873d50d4c6e3656f89007a217fb1d29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Aug 2012 13:50:22 +0200 Subject: Revert "NMI watchdog: fix for lockup detector breakage on resume" Revert commit 45226e9 (NMI watchdog: fix for lockup detector breakage on resume) which breaks resume from system suspend on my SH7372 Mackerel board (by causing a NULL pointer dereference to happen) and is generally wrong, because it abuses the CPU hotplug functionality in a shamelessly blatant way. The original issue should be addressed through appropriate syscore resume callback instead. Signed-off-by: Rafael J. Wysocki --- include/linux/sched.h | 8 -------- kernel/power/suspend.c | 3 --- kernel/watchdog.c | 21 ++------------------- 3 files changed, 2 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c147e7024f11..b8c86648a2f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -334,14 +334,6 @@ static inline void lockup_detector_init(void) } #endif -#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) -void lockup_detector_bootcpu_resume(void); -#else -static inline void lockup_detector_bootcpu_resume(void) -{ -} -#endif - #ifdef CONFIG_DETECT_HUNG_TASK extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_check_count; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1da39ea248fd..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); - /* Kick the lockup detector */ - lockup_detector_bootcpu_resume(); - Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69add8a9da68..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -575,7 +575,7 @@ out: /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -610,27 +610,10 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -#ifdef CONFIG_SUSPEND -/* - * On exit from suspend we force an offline->online transition on the boot CPU - * so that the PMU state that was lost while in suspended state gets set up - * properly for the boot CPU. This information is required for restarting the - * NMI watchdog. - */ -void lockup_detector_bootcpu_resume(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - - cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); -} -#endif - void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- cgit v1.2.3-71-gd317 From e3756477aec028427fec767957c0d4b6cfb87208 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 10 Aug 2012 15:07:09 -0400 Subject: printk: Fix calculation of length used to discard records While tracking down a weird buffer overflow issue in a program that looked to be sane, I started double checking the length returned by syslog(SYSLOG_ACTION_READ_ALL, ...) to make sure it wasn't overflowing the buffer. Sure enough, it was. I saw this in strace: 11339 syslog(SYSLOG_ACTION_READ_ALL, "<5>[244017.708129] REISERFS (dev"..., 8192) = 8279 It turns out that the loops that calculate how much space the entries will take when they're copied don't include the newlines and prefixes that will be included in the final output since prev flags is passed as zero. This patch properly accounts for it and fixes the overflow. CC: stable@kernel.org Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6a76ab9d4476..66a2ea37b576 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1034,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len += msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } @@ -1046,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len -= msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } -- cgit v1.2.3-71-gd317 From a35b6466aabb051568b844e8c63f87a356d3d129 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Aug 2012 21:46:40 +0200 Subject: sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies Peter Portante reported that for large cgroup hierarchies (and or on large CPU counts) we get immense lock contention on rq->lock and stuff stops working properly. His workload was a ton of processes, each in their own cgroup, everybody idling except for a sporadic wakeup once every so often. It was found that: schedule() idle_balance() load_balance() local_irq_save() double_rq_lock() update_h_load() walk_tg_tree(tg_load_down) tg_load_down() Results in an entire cgroup hierarchy walk under rq->lock for every new-idle balance and since new-idle balance isn't throttled this results in a lot of work while holding the rq->lock. This patch does two things, it removes the work from under rq->lock based on the good principle of race and pray which is widely employed in the load-balancer as a whole. And secondly it throttles the update_h_load() calculation to max once per jiffy. I considered excluding update_h_load() for new-idle balance all-together, but purely relying on regular balance passes to update this data might not work out under some rare circumstances where the new-idle busiest isn't the regular busiest for a while (unlikely, but a nightmare to debug if someone hits it and suffers). Cc: pjt@google.com Cc: Larry Woodman Cc: Mike Galbraith Reported-by: Peter Portante Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-aaarrzfpnaam7pqrekofu8a6@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 11 +++++++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0cc03b3e70b..c219bf8d704c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + struct rq *rq = cpu_rq(cpu); + unsigned long now = jiffies; + + if (rq->h_load_throttle == now) + return; + + rq->h_load_throttle = now; + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); rcu_read_unlock(); @@ -4293,11 +4301,10 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); /* * cur_ld_moved - load moved in current iteration diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c35a1a7dd4d6..531411b1044e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -374,7 +374,11 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif -- cgit v1.2.3-71-gd317 From bea6832cc8c4a0a9a65dd17da6aaa657fe27bc3e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 8 Aug 2012 11:27:15 +0200 Subject: sched: fix divide by zero at {thread_group,task}_times On architectures where cputime_t is 64 bit type, is possible to trigger divide by zero on do_div(temp, (__force u32) total) line, if total is a non zero number but has lower 32 bit's zeroed. Removing casting is not a good solution since some do_div() implementations do cast to u32 internally. This problem can be triggered in practice on very long lived processes: PID: 2331 TASK: ffff880472814b00 CPU: 2 COMMAND: "oraagent.bin" #0 [ffff880472a51b70] machine_kexec at ffffffff8103214b #1 [ffff880472a51bd0] crash_kexec at ffffffff810b91c2 #2 [ffff880472a51ca0] oops_end at ffffffff814f0b00 #3 [ffff880472a51cd0] die at ffffffff8100f26b #4 [ffff880472a51d00] do_trap at ffffffff814f03f4 #5 [ffff880472a51d60] do_divide_error at ffffffff8100cfff #6 [ffff880472a51e00] divide_error at ffffffff8100be7b [exception RIP: thread_group_times+0x56] RIP: ffffffff81056a16 RSP: ffff880472a51eb8 RFLAGS: 00010046 RAX: bc3572c9fe12d194 RBX: ffff880874150800 RCX: 0000000110266fad RDX: 0000000000000000 RSI: ffff880472a51eb8 RDI: 001038ae7d9633dc RBP: ffff880472a51ef8 R8: 00000000b10a3a64 R9: ffff880874150800 R10: 00007fcba27ab680 R11: 0000000000000202 R12: ffff880472a51f08 R13: ffff880472a51f10 R14: 0000000000000000 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff880472a51f00] do_sys_times at ffffffff8108845d #8 [ffff880472a51f40] sys_times at ffffffff81088524 #9 [ffff880472a51f80] system_call_fastpath at ffffffff8100b0f2 RIP: 0000003808caac3a RSP: 00007fcba27ab6d8 RFLAGS: 00000202 RAX: 0000000000000064 RBX: ffffffff8100b0f2 RCX: 0000000000000000 RDX: 00007fcba27ab6e0 RSI: 000000000076d58e RDI: 00007fcba27ab6e0 RBP: 00007fcba27ab700 R8: 0000000000000020 R9: 000000000000091b R10: 00007fcba27ab680 R11: 0000000000000202 R12: 00007fff9ca41940 R13: 0000000000000000 R14: 00007fcba27ac9c0 R15: 00007fff9ca41940 ORIG_RAX: 0000000000000064 CS: 0033 SS: 002b Cc: stable@vger.kernel.org Signed-off-by: Stanislaw Gruszka Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120808092714.GA3580@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2cb4e7777998..e2c5841c7dfe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) */ rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(utime, rtime, total); + else utime = rtime; /* @@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); -- cgit v1.2.3-71-gd317 From 35cf4e50b16331def6cfcbee11e49270b6db07f5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 7 Aug 2012 05:00:13 +0200 Subject: sched,cgroup: Fix up task_groups list With multiple instances of task_groups, for_each_rt_rq() is a noop, no task groups having been added to the rt.c list instance. This renders __enable/disable_runtime() and print_rt_stats() noop, the user (non) visible effect being that rt task groups are missing in /proc/sched_debug. Signed-off-by: Mike Galbraith Cc: stable@kernel.org # v3.3+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344308413.6846.7.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e2c5841c7dfe..6aa212f3c581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7252,6 +7252,7 @@ int in_sched_functions(unsigned long addr) #ifdef CONFIG_CGROUP_SCHED struct task_group root_task_group; +LIST_HEAD(task_groups); #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 531411b1044e..f6714d009e77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3-71-gd317 From e221d028bb08b47e624c5f0a31732c642db9d19a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 7 Aug 2012 10:02:38 +0200 Subject: sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled Root task group bandwidth replenishment must service all CPUs, regardless of where the timer was last started, and regardless of the isolation mechanism, lest 'Quoth the Raven, "Nevermore"' become rt scheduling policy. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344326558.6968.25.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/rt.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca01102..944cb68420e9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); -- cgit v1.2.3-71-gd317 From 8f6189684eb4e85e6c593cd710693f09c944450a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 4 Aug 2012 05:44:14 +0200 Subject: sched: Fix migration thread runtime bogosity Make stop scheduler class do the same accounting as other classes, Migration threads can be caught in the act while doing exec balancing, leading to the below due to use of unmaintained ->se.exec_start. The load that triggered this particular instance was an apparently out of control heavily threaded application that does system monitoring in what equated to an exec bomb, with one of the VERY frequently migrated tasks being ps. %CPU PID USER CMD 99.3 45 root [migration/10] 97.7 53 root [migration/12] 97.0 57 root [migration/13] 90.1 49 root [migration/11] 89.6 65 root [migration/15] 88.7 17 root [migration/3] 80.4 37 root [migration/8] 78.1 41 root [migration/9] 44.2 13 root [migration/2] Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344051854.6739.19.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/stop_task.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd23..da5eb5bed84a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; return stop; + } return NULL; } @@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; } static void switched_to_stop(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-71-gd317 From 0fe33aae0e94b4097dd433c9399e16e17d638cd8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: don't free_chunk() after fsnotify_add_mark() Don't do free_chunk() after fsnotify_add_mark(). That one does a delayed unref via the destroy list and this results in use-after-free. Signed-off-by: Miklos Szeredi Acked-by: Eric Paris CC: stable@vger.kernel.org --- kernel/audit_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3a5ca582ba1e..69a58515f43f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -259,7 +259,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -322,7 +322,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } -- cgit v1.2.3-71-gd317 From a2140fc0cb0325bb6384e788edd27b9a568714e2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: fix refcounting in audit-tree Refcounting of fsnotify_mark in audit tree is broken. E.g: refcount create_chunk alloc_chunk 1 fsnotify_add_mark 2 untag_chunk fsnotify_get_mark 3 fsnotify_destroy_mark audit_tree_freeing_mark 2 fsnotify_put_mark 1 fsnotify_put_mark 0 via destroy_list fsnotify_mark_destroy -1 This was reported by various people as triggering Oops when stopping auditd. We could just remove the put_mark from audit_tree_freeing_mark() but that would break freeing via inode destruction. So this patch simply omits a put_mark after calling destroy_mark or adds a get_mark before. The additional get_mark is necessary where there's no other put_mark after fsnotify_destroy_mark() since it assumes that the caller is holding a reference (or the inode is keeping the mark pinned, not the case here AFAICS). Signed-off-by: Miklos Szeredi Reported-by: Valentin Avram Reported-by: Peter Moody Acked-by: Eric Paris CC: stable@vger.kernel.org --- kernel/audit_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 69a58515f43f..2b2ffff9eca2 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -293,7 +292,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; Fallback: @@ -332,6 +330,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); + fsnotify_get_mark(entry); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry); return 0; @@ -412,6 +411,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); + fsnotify_get_mark(chunk_entry); fsnotify_destroy_mark(chunk_entry); fsnotify_put_mark(chunk_entry); @@ -445,7 +445,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } -- cgit v1.2.3-71-gd317 From b3e8692b4dde5cf5fc60e4b95385229a72623182 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: clean up refcounting in audit-tree Drop the initial reference by fsnotify_init_mark early instead of audit_tree_freeing_mark() at destroy time. In the cases we destroy the mark before we drop the initial reference we need to get rid of the get_mark that balances the put_mark in audit_tree_freeing_mark(). Signed-off-by: Miklos Szeredi --- kernel/audit_tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2b2ffff9eca2..ed206fd88cca 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -292,6 +292,7 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); + fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; Fallback: @@ -330,7 +331,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); - fsnotify_get_mark(entry); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry); return 0; @@ -346,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -411,7 +412,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_get_mark(chunk_entry); fsnotify_destroy_mark(chunk_entry); fsnotify_put_mark(chunk_entry); @@ -444,6 +444,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ return 0; } @@ -915,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); evict_chunk(chunk); - fsnotify_put_mark(entry); + + /* + * We are guaranteed to have at least one reference to the mark from + * either the inode or the caller of fsnotify_destroy_mark(). + */ + BUG_ON(atomic_read(&entry->refcnt) < 1); } static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, -- cgit v1.2.3-71-gd317 From 4e8b14526ca7fb046a81c94002c1c43b6fdf0e9b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 8 Aug 2012 15:36:20 -0400 Subject: time: Improve sanity checking of timekeeping inputs Unexpected behavior could occur if the time is set to a value large enough to overflow a 64bit ktime_t (which is something larger then the year 2262). Also unexpected behavior could occur if large negative offsets are injected via adjtimex. So this patch improves the sanity check timekeeping inputs by improving the timespec_valid() check, and then makes better use of timespec_valid() to make sure we don't set the time to an invalid negative value or one that overflows ktime_t. Note: This does not protect from setting the time close to overflowing ktime_t and then letting natural accumulation cause the overflow. Reported-by: CAI Qian Reported-by: Sasha Levin Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Zhouping Liu Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1344454580-17031-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 7 ------- include/linux/time.h | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 26 ++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 603bec2913b0..06177ba10a16 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -58,13 +58,6 @@ union ktime { typedef union ktime ktime_t; /* Kill this */ -#define KTIME_MAX ((s64)~((u64)1 << 63)) -#if (BITS_PER_LONG == 64) -# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#else -# define KTIME_SEC_MAX LONG_MAX -#endif - /* * ktime_t definitions when using the 64-bit scalar representation: */ diff --git a/include/linux/time.h b/include/linux/time.h index c81c5e40fcb5..b0bbd8f0130d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -107,11 +107,29 @@ static inline struct timespec timespec_sub(struct timespec lhs, return ts_delta; } +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#if (BITS_PER_LONG == 64) +# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#else +# define KTIME_SEC_MAX LONG_MAX +#endif + /* * Returns true if the timespec is norm, false if denorm: */ -#define timespec_valid(ts) \ - (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e16af197a2bc..898bef066a44 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -463,6 +463,8 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -471,10 +473,17 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); write_sequnlock_irqrestore(&tk->lock, flags); @@ -482,7 +491,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -649,7 +658,20 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } seqlock_init(&tk->lock); -- cgit v1.2.3-71-gd317 From 60916a9382e88fbf5e54fd36a3e658efd7ab7bed Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 16 Aug 2012 18:14:14 +0100 Subject: tracing/syscalls: Fix perf syscall tracing when syscall_nr == -1 syscall_get_nr can return -1 in the case that the task is not executing a system call. This patch fixes perf_syscall_{enter,exit} to check that the syscall number is valid before using it as an index into a bitmap. Link: http://lkml.kernel.org/r/1345137254-7377-1-git-send-email-will.deacon@arm.com Cc: Jason Baron Cc: Wade Farnsworth Cc: Frederic Weisbecker Signed-off-by: Will Deacon Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 60e4d7875672..6b245f64c8dd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; -- cgit v1.2.3-71-gd317 From be53db6e4edd9dc013b21a929ad2b142dea8b9c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Aug 2012 14:40:59 +1200 Subject: alpha: take a bunch of syscalls into osf_sys.c New helper: current_thread_info(). Allows to do a bunch of odd syscalls in C. While we are at it, there had never been a reason to do osf_getpriority() in assembler. We also get "namespace"-aware (read: consistent with getuid(2), etc.) behaviour from getx?id() syscalls now. Signed-off-by: Al Viro Signed-off-by: Michael Cree Acked-by: Matt Turner Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/ptrace.h | 5 +- arch/alpha/kernel/entry.S | 109 ---------------------------------------- arch/alpha/kernel/osf_sys.c | 49 ++++++++++++++++++ arch/alpha/kernel/systbls.S | 2 +- kernel/timer.c | 9 ---- 5 files changed, 54 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/ptrace.h b/arch/alpha/include/asm/ptrace.h index fd698a174f26..b87755a19554 100644 --- a/arch/alpha/include/asm/ptrace.h +++ b/arch/alpha/include/asm/ptrace.h @@ -76,7 +76,10 @@ struct switch_stack { #define task_pt_regs(task) \ ((struct pt_regs *) (task_stack_page(task) + 2*PAGE_SIZE) - 1) -#define force_successful_syscall_return() (task_pt_regs(current)->r0 = 0) +#define current_pt_regs() \ + ((struct pt_regs *) ((char *)current_thread_info() + 2*PAGE_SIZE) - 1) + +#define force_successful_syscall_return() (current_pt_regs()->r0 = 0) #endif diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 6d159cee5f2f..22b0c4d414a7 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -796,115 +796,6 @@ sys_rt_sigreturn: br ret_from_sys_call .end sys_rt_sigreturn - .align 4 - .globl sys_sethae - .ent sys_sethae -sys_sethae: - .prologue 0 - stq $16, 152($sp) - ret -.end sys_sethae - - .align 4 - .globl osf_getpriority - .ent osf_getpriority -osf_getpriority: - lda $sp, -16($sp) - stq $26, 0($sp) - .prologue 0 - - jsr $26, sys_getpriority - - ldq $26, 0($sp) - blt $0, 1f - - /* Return value is the unbiased priority, i.e. 20 - prio. - This does result in negative return values, so signal - no error by writing into the R0 slot. */ - lda $1, 20 - stq $31, 16($sp) - subl $1, $0, $0 - unop - -1: lda $sp, 16($sp) - ret -.end osf_getpriority - - .align 4 - .globl sys_getxuid - .ent sys_getxuid -sys_getxuid: - .prologue 0 - ldq $2, TI_TASK($8) - ldq $3, TASK_CRED($2) - ldl $0, CRED_UID($3) - ldl $1, CRED_EUID($3) - stq $1, 80($sp) - ret -.end sys_getxuid - - .align 4 - .globl sys_getxgid - .ent sys_getxgid -sys_getxgid: - .prologue 0 - ldq $2, TI_TASK($8) - ldq $3, TASK_CRED($2) - ldl $0, CRED_GID($3) - ldl $1, CRED_EGID($3) - stq $1, 80($sp) - ret -.end sys_getxgid - - .align 4 - .globl sys_getxpid - .ent sys_getxpid -sys_getxpid: - .prologue 0 - ldq $2, TI_TASK($8) - - /* See linux/kernel/timer.c sys_getppid for discussion - about this loop. */ - ldq $3, TASK_GROUP_LEADER($2) - ldq $4, TASK_REAL_PARENT($3) - ldl $0, TASK_TGID($2) -1: ldl $1, TASK_TGID($4) -#ifdef CONFIG_SMP - mov $4, $5 - mb - ldq $3, TASK_GROUP_LEADER($2) - ldq $4, TASK_REAL_PARENT($3) - cmpeq $4, $5, $5 - beq $5, 1b -#endif - stq $1, 80($sp) - ret -.end sys_getxpid - - .align 4 - .globl sys_alpha_pipe - .ent sys_alpha_pipe -sys_alpha_pipe: - lda $sp, -16($sp) - stq $26, 0($sp) - .prologue 0 - - mov $31, $17 - lda $16, 8($sp) - jsr $26, do_pipe_flags - - ldq $26, 0($sp) - bne $0, 1f - - /* The return values are in $0 and $20. */ - ldl $1, 12($sp) - ldl $0, 8($sp) - - stq $1, 80+16($sp) -1: lda $sp, 16($sp) - ret -.end sys_alpha_pipe - .align 4 .globl sys_execve .ent sys_execve diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 98a103621af6..bc1acdda7a5e 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1404,3 +1404,52 @@ SYSCALL_DEFINE3(osf_writev, unsigned long, fd, } #endif + +SYSCALL_DEFINE2(osf_getpriority, int, which, int, who) +{ + int prio = sys_getpriority(which, who); + if (prio >= 0) { + /* Return value is the unbiased priority, i.e. 20 - prio. + This does result in negative return values, so signal + no error */ + force_successful_syscall_return(); + prio = 20 - prio; + } + return prio; +} + +SYSCALL_DEFINE0(getxuid) +{ + current_pt_regs()->r20 = sys_geteuid(); + return sys_getuid(); +} + +SYSCALL_DEFINE0(getxgid) +{ + current_pt_regs()->r20 = sys_getegid(); + return sys_getgid(); +} + +SYSCALL_DEFINE0(getxpid) +{ + current_pt_regs()->r20 = sys_getppid(); + return sys_getpid(); +} + +SYSCALL_DEFINE0(alpha_pipe) +{ + int fd[2]; + int res = do_pipe_flags(fd, 0); + if (!res) { + /* The return values are in $0 and $20. */ + current_pt_regs()->r20 = fd[1]; + res = fd[0]; + } + return res; +} + +SYSCALL_DEFINE1(sethae, unsigned long, val) +{ + current_pt_regs()->hae = val; + return 0; +} diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index b64157c98e02..2ac6b45c3e00 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -111,7 +111,7 @@ sys_call_table: .quad sys_socket .quad sys_connect .quad sys_accept - .quad osf_getpriority /* 100 */ + .quad sys_osf_getpriority /* 100 */ .quad sys_send .quad sys_recv .quad sys_sigreturn diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eba..8c5e7b908c68 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - /** * sys_getpid - return the thread group id of the current process * @@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); -- cgit v1.2.3-71-gd317 From c7a3a88c938fbe3d70c2278e082b80eb830d1c58 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 19:10:42 +0200 Subject: uprobes: Fix mmap_region()'s mm->mm_rb corruption if uprobe_mmap() fails This patch fixes: https://bugzilla.redhat.com/show_bug.cgi?id=843640 If mmap_region()->uprobe_mmap() fails, unmap_and_free_vma path does unmap_region() but does not remove the soon-to-be-freed vma from rb tree. Actually there are more problems but this is how William noticed this bug. Perhaps we could do do_munmap() + return in this case, but in fact it is simply wrong to abort if uprobe_mmap() fails. Until at least we move the !UPROBE_COPY_INSN code from install_breakpoint() to uprobe_register(). For example, uprobe_mmap()->install_breakpoint() can fail if the probed insn is not supported (remember, uprobe_register() succeeds if nobody mmaps inode/offset), mmap() should not fail in this case. dup_mmap()->uprobe_mmap() is wrong too by the same reason, fork() can race with uprobe_register() and fail for no reason if it wins the race and does install_breakpoint() first. And, if nothing else, both mmap_region() and dup_mmap() return success if uprobe_mmap() fails. Change them to ignore the error code from uprobe_mmap(). Reported-and-tested-by: William Cohen Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: # v3.5 Cc: Anton Arapov Cc: William Cohen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120819171042.GB26957@redhat.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++-- mm/mmap.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3bd2280d79f6..2c8857e12855 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - if (file && uprobe_mmap(tmp)) - goto out; + if (file) + uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); diff --git a/mm/mmap.c b/mm/mmap.c index e3e86914f11a..52e08fc9186d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1356,9 +1356,8 @@ out: } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); - if (file && uprobe_mmap(vma)) - /* matching probes but cannot insert */ - goto unmap_and_free_vma; + if (file) + uprobe_mmap(vma); return addr; -- cgit v1.2.3-71-gd317 From f341861fb0b701139849f8a85c2d3cdff466e8e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 15:05:14 +0200 Subject: task_work: add a scheduling point in task_work_run() It seems commit 4a9d4b024a31 ("switch fput to task_work_add") re- introduced the problem addressed in 944be0b22472 ("close_files(): add scheduling point") If a server process with a lot of files (say 2 million tcp sockets) is killed, we can spend a lot of time in task_work_run() and trigger a soft lockup. Signed-off-by: Eric Dumazet Signed-off-by: Linus Torvalds --- kernel/task_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0c..d320d44903bd 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -75,6 +75,7 @@ void task_work_run(void) p = q->next; q->func(q); q = p; + cond_resched(); } } } -- cgit v1.2.3-71-gd317 From 784ffcbb96c3a97b4c64fd48b1dfe12ef3fcbcda Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:46 -0400 Subject: time: Ensure we normalize the timekeeper in tk_xtime_add Andreas noticed problems with resume on specific hardware after commit 1e75fa8b (time: Condense timekeeper.xtime into xtime_sec) combined with commit b44d50dca (time: Fix casting issue in tk_set_xtime and tk_xtime_add) After some digging I realized we aren't normalizing the timekeeper after the add. Add the missing normalize call. Reported-by: Andreas Schwab Tested-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 898bef066a44..258164ae45cc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -115,6 +115,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) -- cgit v1.2.3-71-gd317 From 85dc8f05c93c8105987de9d7e7cebf15a72ff4ec Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 21 Aug 2012 20:30:47 -0400 Subject: time: Fix casting issue in timekeeping_forward_now arch_gettimeoffset returns a u32 value which when shifted by tk->shift can overflow. This issue was introduced with 1e75fa8be (time: Condense timekeeper.xtime into xtime_sec) Cast it to u64 first. Signed-off-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 258164ae45cc..1dbf80ec7696 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -277,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += arch_gettimeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); -- cgit v1.2.3-71-gd317 From 6ea565a9be32a3c8d1092017686f183b6d8c4514 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:48 -0400 Subject: time: Avoid potential shift overflow with large shift values Andreas Schwab noticed that the 1 << tk->shift could overflow if the shift value was greater than 30, since 1 would be a 32bit long on 32bit architectures. This issue was introduced by 1e75fa8be (time: Condense timekeeper.xtime into xtime_sec) Use 1ULL instead to ensure we don't overflow on the shift. Reported-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1dbf80ec7696..a5a9389c4c30 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1184,9 +1184,9 @@ static void update_wall_time(void) * the vsyscall implementations are converted to use xtime_nsec * (shifted nanoseconds), this can be killed. */ - remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1 << tk->shift; + tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; /* -- cgit v1.2.3-71-gd317 From bf2ac312195155511a0f79325515cbb61929898a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:49 -0400 Subject: time: Avoid making adjustments if we haven't accumulated anything If update_wall_time() is called and the current offset isn't large enough to accumulate, avoid re-calling timekeeping_adjust which may change the clock freq and can cause 1ns inconsistencies with CLOCK_REALTIME_COARSE/CLOCK_MONOTONIC_COARSE. Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1345595449-34965-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a5a9389c4c30..0c1485e42be6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1152,6 +1152,10 @@ static void update_wall_time(void) offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < tk->cycle_interval) + goto out; + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, -- cgit v1.2.3-71-gd317 From cee58483cf56e0ba355fdd97ff5e8925329aa936 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 31 Aug 2012 13:30:06 -0400 Subject: time: Move ktime_t overflow checking into timespec_valid_strict Andreas Bombe reported that the added ktime_t overflow checking added to timespec_valid in commit 4e8b14526ca7 ("time: Improve sanity checking of timekeeping inputs") was causing problems with X.org because it caused timeouts larger then KTIME_T to be invalid. Previously, these large timeouts would be clamped to KTIME_MAX and would never expire, which is valid. This patch splits the ktime_t overflow checking into a new timespec_valid_strict function, and converts the timekeeping codes internal checking to use this more strict function. Reported-and-tested-by: Andreas Bombe Cc: Zhouping Liu Cc: Ingo Molnar Cc: Prarit Bhargava Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- include/linux/time.h | 7 +++++++ kernel/time/timekeeping.c | 10 +++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index b0bbd8f0130d..b51e664c83e7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,6 +125,13 @@ static inline bool timespec_valid(const struct timespec *ts) /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c1485e42be6..34e5eac81424 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -428,7 +428,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if (!timespec_valid(tv)) + if (!timespec_valid_strict(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -476,7 +476,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid(&tmp)) { + if (!timespec_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -659,7 +659,7 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); - if (!timespec_valid(&now)) { + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -667,7 +667,7 @@ void __init timekeeping_init(void) } read_boot_clock(&boot); - if (!timespec_valid(&boot)) { + if (!timespec_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -713,7 +713,7 @@ static struct timespec timekeeping_suspend_time; static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { - if (!timespec_valid(delta)) { + if (!timespec_valid_strict(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; -- cgit v1.2.3-71-gd317 From f319da0c6894fcf55e21320e40506418a2aad629 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Aug 2012 11:26:57 +0200 Subject: sched: Fix load avg vs cpu-hotplug Rabik and Paul reported two different issues related to the same few lines of code. Rabik's issue is that the nr_uninterruptible migration code is wrong in that he sees artifacts due to this (Rabik please do expand in more detail). Paul's issue is that this code as it stands relies on us using stop_machine() for unplug, we all would like to remove this assumption so that eventually we can remove this stop_machine() usage altogether. The only reason we'd have to migrate nr_uninterruptible is so that we could use for_each_online_cpu() loops in favour of for_each_possible_cpu() loops, however since nr_uninterruptible() is the only such loop and its using possible lets not bother at all. The problem Rabik sees is (probably) caused by the fact that by migrating nr_uninterruptible we screw rq->calc_load_active for both rqs involved. So don't bother with fancy migration schemes (meaning we now have to keep using for_each_possible_cpu()) and instead fold any nr_active delta after we migrate all tasks away to make sure we don't have any skewed nr_active accounting. Reported-by: Rakib Mullick Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345454817.23018.27.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..207a81c769d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5618,8 +5608,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + calc_load_migrate(rq); break; #endif } -- cgit v1.2.3-71-gd317 From 749c8814f08f12baa4a9c2812a7c6ede7d69507d Mon Sep 17 00:00:00 2001 From: Charles Wang Date: Mon, 20 Aug 2012 16:02:33 +0800 Subject: sched: Add missing call to calc_load_exit_idle() Azat Khuzhin reported high loadavg in Linux v3.6 After checking the upstream scheduler code, I found Peter's commit: 5167e8d5417b sched/nohz: Rewrite and fix load-avg computation -- again not fully applied, missing the call to calc_load_exit_idle(). After that idle exit in sampling window will always be calculated to non-idle, and the load will be higher than normal. This patch adds the missing call to calc_load_exit_idle(). Signed-off-by: Charles Wang Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345449754-27130-1-git-send-email-muming.wq@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f74..3a9e5d5c1091 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); update_cpu_load_nohz(); + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3-71-gd317 From a4c96ae319b8047f62dedbe1eac79e321c185749 Mon Sep 17 00:00:00 2001 From: Peter Boonstoppel Date: Thu, 9 Aug 2012 15:34:47 -0700 Subject: sched: Unthrottle rt runqueues in __disable_runtime() migrate_tasks() uses _pick_next_task_rt() to get tasks from the real-time runqueues to be migrated. When rt_rq is throttled _pick_next_task_rt() won't return anything, in which case migrate_tasks() can't move all threads over and gets stuck in an infinite loop. Instead unthrottle rt runqueues before migrating tasks. Additionally: move unthrottle_offline_cfs_rqs() to rq_offline_fair() Signed-off-by: Peter Boonstoppel Signed-off-by: Peter Zijlstra Cc: Paul Turner Link: http://lkml.kernel.org/r/5FBF8E85CA34454794F0F7ECBA79798F379D3648B7@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/fair.c | 7 +++++-- kernel/sched/rt.c | 1 + kernel/sched/sched.h | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 207a81c769d4..a4ea245f3d85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5342,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); - for ( ; ; ) { /* * There's this thread running, bail when that's the only diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c219bf8d704c..86ad83c45dae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -4956,6 +4956,9 @@ static void rq_online_fair(struct rq *rq) static void rq_offline_fair(struct rq *rq) { update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 944cb68420e9..e0b7ba9c040f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -691,6 +691,7 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e77..0848fa36c383 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -- cgit v1.2.3-71-gd317 From 9450d57eab5cad36774c297da123062744472588 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 17:45:08 -0700 Subject: sched: Fix kernel-doc warnings in kernel/sched/fair.c Fix two kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3660): Excess function parameter 'cpus' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3806): Excess function parameter 'cpus' description in 'update_sd_lb_stats' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/50303714.3090204@xenotime.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 86ad83c45dae..42d9df6a5ca4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3658,7 +3658,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ @@ -3805,7 +3804,6 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -- cgit v1.2.3-71-gd317 From a6fa941d94b411bbd2b6421ffbde6db3c93e65ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Aug 2012 14:59:25 +0100 Subject: perf_event: Switch to internal refcount, fix race with close() Don't mess with file refcounts (or keep a reference to file, for that matter) in perf_event. Use explicit refcount of its own instead. Deal with the race between the final reference to event going away and new children getting created for it by use of atomic_long_inc_not_zero() in inherit_event(); just have the latter free what it had allocated and return NULL, that works out just fine (children of siblings of something doomed are created as singletons, same as if the child of leader had been created and immediately killed). Signed-off-by: Al Viro Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820135925.GG23464@ZenIV.linux.org.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7602ccb3f40e..ad04dfcd6f35 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -926,7 +926,7 @@ struct perf_event { struct hw_perf_event hw; struct perf_event_context *ctx; - struct file *filp; + atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..efef4282b8e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6844,14 +6856,6 @@ inherit_event(struct perf_event *parent_event, add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - /* * Link this into the parent event's child list */ -- cgit v1.2.3-71-gd317 From 500ad2d8b01390c98bc6dce068bccfa9534b8212 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 2 Aug 2012 13:46:35 +0530 Subject: perf/hwpb: Invoke __perf_event_disable() if interrupts are already disabled While debugging a warning message on PowerPC while using hardware breakpoints, it was discovered that when perf_event_disable is invoked through hw_breakpoint_handler function with interrupts disabled, a subsequent IPI in the code path would trigger a WARN_ON_ONCE message in smp_call_function_single function. This patch calls __perf_event_disable() when interrupts are already disabled, instead of perf_event_disable(). Reported-by: Edjunior Barbosa Machado Signed-off-by: K.Prasad [naveen.n.rao@linux.vnet.ibm.com: v3: Check to make sure we target current task] Signed-off-by: Naveen N. Rao Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120802081635.5811.17737.stgit@localhost.localdomain [ Fixed build error on MIPS. ] Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ad04dfcd6f35..33ed9d605f91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1296,6 +1296,7 @@ extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); +extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else static inline void @@ -1334,6 +1335,7 @@ static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } +static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index efef4282b8e8..7fee567153f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry: /* * Cross CPU call to disable a performance event */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee12..9a7b487c6fe2 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att int old_type = bp->attr.bp_type; int err = 0; - perf_event_disable(bp); + /* + * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it + * will not be possible to raise IPIs that invoke __perf_event_disable. + * So call the function directly after making sure we are targeting the + * current task. + */ + if (irqs_disabled() && bp->ctx && bp->ctx->task == current) + __perf_event_disable(bp); + else + perf_event_disable(bp); bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; -- cgit v1.2.3-71-gd317 From 96e65306b81351b656835c15931d1d237b252f27 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 2 Sep 2012 00:28:19 +0800 Subject: workqueue: UNBOUND -> REBIND morphing in rebind_workers() should be atomic The compiler may compile the following code into TWO write/modify instructions. worker->flags &= ~WORKER_UNBOUND; worker->flags |= WORKER_REBIND; so the other CPU may temporarily see worker->flags which doesn't have either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup prematurely. Fix it by using single explicit assignment via ACCESS_ONCE(). Because idle workers have another WORKER_NOT_RUNNING flag, this bug doesn't exist for them; however, update it to use the same pattern for consistency. tj: Applied the change to idle workers too and updated comments and patch description a bit. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a10..c462cd60c374 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1396,12 +1396,15 @@ retry: /* set REBIND and kick idle ones, we'll wait for these later */ for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) { + unsigned long worker_flags = worker->flags; + if (worker->flags & WORKER_REBIND) continue; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; idle_rebind.cnt++; worker->idle_rebind = &idle_rebind; @@ -1434,10 +1437,12 @@ retry: /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) -- cgit v1.2.3-71-gd317 From 90beca5de591e12482a812f23a7f10690962ed4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:12:33 -0700 Subject: workqueue: move WORKER_REBIND clearing in rebind_workers() to the end of the function This doesn't make any functional difference and is purely to help the next patch to be simpler. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c462cd60c374..d79a18d0c42e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1422,19 +1422,7 @@ retry: goto retry; } - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags &= ~WORKER_REBIND; - - wake_up_all(&gcwq->rebind_hold); - - /* rebind busy workers */ + /* all idle workers are rebound, rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; unsigned long worker_flags = worker->flags; @@ -1454,6 +1442,18 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); } static struct worker *alloc_worker(void) -- cgit v1.2.3-71-gd317 From ec58815ab0409a921a7c9744eb4ca44866b14d71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:16:32 -0700 Subject: workqueue: fix possible deadlock in idle worker rebinding Currently, rebind_workers() and idle_worker_rebind() are two-way interlocked. rebind_workers() waits for idle workers to finish rebinding and rebound idle workers wait for rebind_workers() to finish rebinding busy workers before proceeding. Unfortunately, this isn't enough. The second wait from idle workers is implemented as follows. wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); rebind_workers() clears WORKER_REBIND, wakes up the idle workers and then returns. If CPU hotplug cycle happens again before one of the idle workers finishes the above wait_event(), rebind_workers() will repeat the first part of the handshake - set WORKER_REBIND again and wait for the idle worker to finish rebinding - and this leads to deadlock because the idle worker would be waiting for WORKER_REBIND to clear. This is fixed by adding another interlocking step at the end - rebind_workers() now waits for all the idle workers to finish the above WORKER_REBIND wait before returning. This ensures that all rebinding steps are complete on all idle workers before the next hotplug cycle can happen. This problem was diagnosed by Lai Jiangshan who also posted a patch to fix the issue, upon which this patch is based. This is the minimal fix and further patches are scheduled for the next merge window to simplify the CPU hotplug path. Signed-off-by: Tejun Heo Original-patch-by: Lai Jiangshan LKML-Reference: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com> --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d79a18d0c42e..dc7b8458e275 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1326,6 +1326,15 @@ static void idle_worker_rebind(struct worker *worker) /* we did our part, wait for rebind_workers() to finish up */ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + + /* + * rebind_workers() shouldn't finish until all workers passed the + * above WORKER_REBIND wait. Tell it when done. + */ + spin_lock_irq(&worker->pool->gcwq->lock); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); } /* @@ -1448,12 +1457,28 @@ retry: * be cleared inside idle_worker_rebind(). Clear and release. * Clearing %WORKER_REBIND from this foreign context is safe * because these workers are still guaranteed to be idle. + * + * We need to make sure all idle workers passed WORKER_REBIND wait + * in idle_worker_rebind() before returning; otherwise, workers can + * get stuck at the wait if hotplug cycle repeats. */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { worker->flags &= ~WORKER_REBIND; + idle_rebind.cnt++; + } + } wake_up_all(&gcwq->rebind_hold); + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + } } static struct worker *alloc_worker(void) -- cgit v1.2.3-71-gd317 From 552a37e9360a293cd20e7f8ff1fb326a244c5f1e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:33 -0700 Subject: workqueue: restore POOL_MANAGING_WORKERS This patch restores POOL_MANAGING_WORKERS which was replaced by pool->manager_mutex by 6037315269 "workqueue: use mutex for global_cwq manager exclusion". There's a subtle idle worker depletion bug across CPU hotplug events and we need to distinguish an actual manager and CPU hotplug preventing management. POOL_MANAGING_WORKERS will be used for the former and manager_mutex the later. This patch just lays POOL_MANAGING_WORKERS on top of the existing manager_mutex and doesn't introduce any synchronization changes. The next patch will update it. Note that this patch fixes a non-critical anomaly where too_many_workers() may return %true spuriously while CPU hotplug is in progress. While the issue could schedule idle timer spuriously, it didn't trigger any actual misbehavior. tj: Rewrote patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc7b8458e275..383548ed0b54 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_mutex); + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1827,6 +1828,7 @@ static bool manage_workers(struct worker *worker) if (!mutex_trylock(&pool->manager_mutex)) return ret; + pool->flags |= POOL_MANAGING_WORKERS; pool->flags &= ~POOL_MANAGE_WORKERS; /* @@ -1836,6 +1838,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); + pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->manager_mutex); return ret; } -- cgit v1.2.3-71-gd317 From ee378aa49b594da9bda6a2c768cc5b2ad585f911 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:44 -0700 Subject: workqueue: fix possible idle worker depletion across CPU hotplug To simplify both normal and CPU hotplug paths, worker management is prevented while CPU hoplug is in progress. This is achieved by CPU hotplug holding the same exclusion mechanism used by workers to ensure there's only one manager per pool. If someone else seems to be performing the manager role, workers proceed to execute work items. CPU hotplug using the same mechanism can lead to idle worker depletion because all workers could proceed to execute work items while CPU hotplug is in progress and CPU hotplug itself wouldn't actually perform the worker management duty - it doesn't guarantee that there's an idle worker left when it releases management. This idle worker depletion, under extreme circumstances, can break forward-progress guarantee and thus lead to deadlock. This patch fixes the bug by using separate mechanisms for manager exclusion among workers and hotplug exclusion. For manager exclusion, POOL_MANAGING_WORKERS which was restored by the previous patch is used. pool->manager_mutex is now only used for exclusion between the elected manager and CPU hotplug. The elected manager won't proceed without holding pool->manager_mutex. This ensures that the worker which won the manager position can't skip managing while CPU hotplug is in progress. It will block on manager_mutex and perform management after CPU hotplug is complete. Note that hotplug may happen while waiting for manager_mutex. A manager isn't either on idle or busy list and thus the hoplug code can't unbind/rebind it. Make the manager handle its own un/rebinding. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 383548ed0b54..1e1373bcb3e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1825,10 +1825,45 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (!mutex_trylock(&pool->manager_mutex)) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; pool->flags |= POOL_MANAGING_WORKERS; + + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->manager_mutex to synchronize + * manager against CPU hotplug. + * + * manager_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->manager_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for manager_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via manager_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + pool->flags &= ~POOL_MANAGE_WORKERS; /* -- cgit v1.2.3-71-gd317 From ec145babe754f9ea1079034a108104b6001e001c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 19:26:03 -0400 Subject: time: Fix timeekeping_get_ns overflow on 32bit systems Daniel Lezcano reported seeing multi-second stalls from keyboard input on his T61 laptop when NOHZ and CPU_IDLE were enabled on a 32bit kernel. He bisected the problem down to commit 1e75fa8be9fb6 ("time: Condense timekeeper.xtime into xtime_sec"). After reproducing this issue, I narrowed the problem down to the fact that timekeeping_get_ns() returns a 64bit nsec value that hasn't been accumulated. In some cases this value was being then stored in timespec.tv_nsec (which is a long). On 32bit systems, with idle times larger then 4 seconds (or less, depending on the value of xtime_nsec), the returned nsec value would overflow 32bits. This limited kept time from increasing, causing timers to not expire. The fix is to make sure we don't directly store the result of timekeeping_get_ns() into a tv_nsec field, instead using a 64bit nsec value which can then be added into the timespec via timespec_add_ns(). Reported-and-bisected-by: Daniel Lezcano Tested-by: Daniel Lezcano Signed-off-by: John Stultz Acked-by: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1347405963-35715-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34e5eac81424..d3b91e75cecd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -303,10 +303,11 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(tk); } while (read_seqretry(&tk->lock, seq)); + ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getnstimeofday); @@ -345,6 +346,7 @@ void ktime_get_ts(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -352,13 +354,14 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -1244,6 +1247,7 @@ void get_monotonic_boottime(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -1251,14 +1255,15 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); + ts->tv_sec += tomono.tv_sec + sleep.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); -- cgit v1.2.3-71-gd317 From 37407ea7f93864c2cfc03edf8f37872ec539ea2b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Sep 2012 12:29:43 -0700 Subject: Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations" This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda. Nikolay Ulyanitsky reported thatthe 3.6-rc5 kernel has a 15-20% performance drop on PostgreSQL 9.2 on his machine (running "pgbench"). Borislav Petkov was able to reproduce this, and bisected it to this commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...") apparently because the new single-idle-buddy model simply doesn't find idle CPU's to reschedule on aggressively enough. Mike Galbraith suspects that it is likely due to the user-mode spinlocks in PostgreSQL not reacting well to preemption, but we don't really know the details - I'll just revert the commit for now. There are hopefully other approaches to improve scheduler scalability without it causing these kinds of downsides. Reported-by: Nikolay Ulyanitsky Bisected-by: Borislav Petkov Acked-by: Mike Galbraith Cc: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/sched/core.c | 39 +-------------------------------------- kernel/sched/fair.c | 28 +++++++++++++++++++++------- 3 files changed, 22 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c86648a2f9..23bddac4bad8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -954,7 +954,6 @@ struct sched_domain { unsigned int smt_gain; int flags; /* See SD_* */ int level; - int idle_buddy; /* cpu assigned to select_idle_sibling() */ /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4ea245f3d85..649c9f876cb1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * - * Iterate domains and sched_groups downward, assigning CPUs to be - * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing - * due to random perturbation self canceling, ie sw buddies pull - * their counterpart to their CPU's hw counterpart. - * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - struct sched_domain *tmp = sd; - struct sched_group *sg, *prev; - bool right; - - /* - * Traverse to first CPU in group, and count hops - * to cpu from there, switching direction on each - * hop, never ever pointing the last CPU rightward. - */ - do { - id = cpumask_first(sched_domain_span(tmp)); - prev = sg = tmp->groups; - right = 1; - - while (cpumask_first(sched_group_cpus(sg)) != id) - sg = sg->next; - - while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { - prev = sg; - sg = sg->next; - right = !right; - } - - /* A CPU went down, never point back to domain start. */ - if (right && cpumask_first(sched_group_cpus(sg->next)) == id) - right = false; - - sg = right ? sg->next : prev; - tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); - } while ((tmp = tmp->child)); - + if (sd) id = cpumask_first(sched_domain_span(sd)); - } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42d9df6a5ca4..96e2b18b6283 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; + struct sched_group *sg; + int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, check assigned siblings to find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) - continue; - if (idle_cpu(sd->idle_buddy)) - return sd->idle_buddy; - } + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); + } +done: return target; } -- cgit v1.2.3-71-gd317 From 579035dc5ddd6d48fd8529e7358b03d911ab9d8a Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 17 Sep 2012 14:09:12 -0700 Subject: pid-namespace: limit value of ns_last_pid to (0, max_pid) The kernel doesn't check the pid for negative values, so if you try to write -2 to /proc/sys/kernel/ns_last_pid, you will get a kernel panic. The crash happens because the next pid is -1, and alloc_pidmap() will try to access to a nonexistent pidmap. map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; Signed-off-by: Andrew Vagin Acked-by: Cyrill Gorcunov Acked-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b3c7fd554250..6144bab8fd8e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, */ tmp.data = ¤t->nsproxy->pid_ns->last_pid; - return proc_dointvec(&tmp, write, buffer, lenp, ppos); + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } +extern int pid_max; +static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, + .extra1 = &zero, + .extra2 = &pid_max, }, { } }; -- cgit v1.2.3-71-gd317 From 960bd11bf2daf669d0d910428fd9ef5a15c3d7cb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 17 Sep 2012 15:42:31 -0700 Subject: workqueue: always clear WORKER_REBIND in busy_worker_rebind_fn() busy_worker_rebind_fn() didn't clear WORKER_REBIND if rebinding failed (CPU is down again). This used to be okay because the flag wasn't used for anything else. However, after 25511a477 "workqueue: reimplement CPU online rebinding to handle idle workers", WORKER_REBIND is also used to command idle workers to rebind. If not cleared, the worker may confuse the next CPU_UP cycle by having REBIND spuriously set or oops / get stuck by prematurely calling idle_worker_rebind(). WARNING: at /work/os/wq/kernel/workqueue.c:1323 worker_thread+0x4cd/0x5 00() Hardware name: Bochs Modules linked in: test_wq(O-) Pid: 33, comm: kworker/1:1 Tainted: G O 3.6.0-rc1-work+ #3 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] worker_thread+0x4cd/0x500 [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 ---[ end trace e977cf20f4661968 ]--- BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] worker_thread+0x360/0x500 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: test_wq(O-) CPU 0 Pid: 33, comm: kworker/1:1 Tainted: G W O 3.6.0-rc1-work+ #3 Bochs Bochs RIP: 0010:[] [] worker_thread+0x360/0x500 RSP: 0018:ffff88001e1c9de0 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffff88001e633e00 RCX: 0000000000004140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000009 RBP: ffff88001e1c9ea0 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000002 R11: 0000000000000000 R12: ffff88001fc8d580 R13: ffff88001fc8d590 R14: ffff88001e633e20 R15: ffff88001e1c6900 FS: 0000000000000000(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000000130e8000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:1 (pid: 33, threadinfo ffff88001e1c8000, task ffff88001e1c6900) Stack: ffff880000000000 ffff88001e1c9e40 0000000000000001 ffff88001e1c8010 ffff88001e519c78 ffff88001e1c9e58 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001fc8d340 ffff88001fc8d340 Call Trace: [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 Code: b1 00 f6 43 48 02 0f 85 91 01 00 00 48 8b 43 38 48 89 df 48 8b 00 48 89 45 90 e8 ac f0 ff ff 3c 01 0f 85 60 01 00 00 48 8b 53 50 <8b> 02 83 e8 01 85 c0 89 02 0f 84 3b 01 00 00 48 8b 43 38 48 8b RIP [] worker_thread+0x360/0x500 RSP CR2: 0000000000000000 There was no reason to keep WORKER_REBIND on failure in the first place - WORKER_UNBOUND is guaranteed to be set in such cases preventing incorrectly activating concurrency management. Always clear WORKER_REBIND. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e1373bcb3e3..b80065a2450a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1349,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_maybe_bind_and_lock(worker); + + /* + * %WORKER_REBIND must be cleared even if the above binding failed; + * otherwise, we may confuse the next CPU_UP cycle or oops / get + * stuck by calling idle_worker_rebind() prematurely. If CPU went + * down again inbetween, %WORKER_UNBOUND would be set, so clearing + * %WORKER_REBIND is always safe. + */ + worker_clr_flags(worker, WORKER_REBIND); spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.3-71-gd317 From ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Sep 2012 12:48:43 -0700 Subject: workqueue: reimplement work_on_cpu() using system_wq The existing work_on_cpu() implementation is hugely inefficient. It creates a new kthread, execute that single function and then let the kthread die on each invocation. Now that system_wq can handle concurrent executions, there's no advantage of doing this. Reimplement work_on_cpu() using system_wq which makes it simpler and way more efficient. stable: While this isn't a fix in itself, it's needed to fix a workqueue related bug in cpufreq/powernow-k8. AFAICS, this shouldn't break other existing users. Signed-off-by: Tejun Heo Acked-by: Jiri Kosina Cc: Linus Torvalds Cc: Bjorn Helgaas Cc: Len Brown Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b80065a2450a..3c5a79e2134c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3576,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3602,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.2.3-71-gd317