From 874bbfe600a660cba9c776b3957b1ce393151b76 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 30 Sep 2015 09:05:30 -0700 Subject: workqueue: make sure delayed work run in local cpu My system keeps crashing with below message. vmstat_update() schedules a delayed work in current cpu and expects the work runs in the cpu. schedule_delayed_work() is expected to make delayed work run in local cpu. The problem is timer can be migrated with NO_HZ. __queue_work() queues work in timer handler, which could run in a different cpu other than where the delayed work is scheduled. The end result is the delayed work runs in different cpu. The patch makes __queue_delayed_work records local cpu earlier. Where the timer runs doesn't change where the work runs with the change. [ 28.010131] ------------[ cut here ]------------ [ 28.010609] kernel BUG at ../mm/vmstat.c:1392! [ 28.011099] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [ 28.011860] Modules linked in: [ 28.012245] CPU: 0 PID: 289 Comm: kworker/0:3 Tainted: G W4.3.0-rc3+ #634 [ 28.013065] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014 [ 28.014160] Workqueue: events vmstat_update [ 28.014571] task: ffff880117682580 ti: ffff8800ba428000 task.ti: ffff8800ba428000 [ 28.015445] RIP: 0010:[] []vmstat_update+0x31/0x80 [ 28.016282] RSP: 0018:ffff8800ba42fd80 EFLAGS: 00010297 [ 28.016812] RAX: 0000000000000000 RBX: ffff88011a858dc0 RCX:0000000000000000 [ 28.017585] RDX: ffff880117682580 RSI: ffffffff81f14d8c RDI:ffffffff81f4df8d [ 28.018366] RBP: ffff8800ba42fd90 R08: 0000000000000001 R09:0000000000000000 [ 28.019169] R10: 0000000000000000 R11: 0000000000000121 R12:ffff8800baa9f640 [ 28.019947] R13: ffff88011a81e340 R14: ffff88011a823700 R15:0000000000000000 [ 28.020071] FS: 0000000000000000(0000) GS:ffff88011a800000(0000)knlGS:0000000000000000 [ 28.020071] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 28.020071] CR2: 00007ff6144b01d0 CR3: 00000000b8e93000 
CR4:00000000000006f0 [ 28.020071] Stack: [ 28.020071] ffff88011a858dc0 ffff8800baa9f640 ffff8800ba42fe00ffffffff8106bd88 [ 28.020071] ffffffff8106bd0b 0000000000000096 0000000000000000ffffffff82f9b1e8 [ 28.020071] ffffffff829f0b10 0000000000000000 ffffffff81f18460ffff88011a81e340 [ 28.020071] Call Trace: [ 28.020071] [] process_one_work+0x1c8/0x540 [ 28.020071] [] ? process_one_work+0x14b/0x540 [ 28.020071] [] worker_thread+0x114/0x460 [ 28.020071] [] ? process_one_work+0x540/0x540 [ 28.020071] [] kthread+0xf8/0x110 [ 28.020071] [] ?kthread_create_on_node+0x200/0x200 [ 28.020071] [] ret_from_fork+0x3f/0x70 [ 28.020071] [] ?kthread_create_on_node+0x200/0x200 Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v2.6.31+ --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca71582fcfab..bcb14cafe007 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; + /* timer isn't guaranteed to run in this cpu, record earlier */ + if (cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - if (unlikely(cpu != WORK_CPU_UNBOUND)) - add_timer_on(timer, cpu); - else - add_timer(timer); + add_timer_on(timer, cpu); } /** -- cgit v1.2.3-71-gd317 From 9babcd7929bc8967ae3bb6093f603b93c2f9958f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 8 Oct 2015 15:36:06 -0300 Subject: sched, tracing: Stop/start critical timings around the idle=poll idle loop When using idle=poll, the preemptoff tracer is always showing the idle task as the culprit for long latencies. That happens because critical timings are not stopped before idle loop. 
This patch stops critical timings before entering the idle loop, starting it again after the idle loop. This problem does not affect the irqsoff tracer because interruptions are enabled before entering the idle loop. Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Luis Claudio R. Goncalves Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/10fc3705874aef11dbe152a068b591a7be1899b4.1444314899.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f177c73ae19..4a2ef5a02fd3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); + stop_critical_timings(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); return 1; -- cgit v1.2.3-71-gd317 From 0701c53e460ea64daf0ee789d0b08fef57800016 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 Oct 2015 19:14:45 +0100 Subject: genirq/msi: Do not use pci_msi_[un]mask_irq as default methods When we create a generic MSI domain with MSI_FLAG_USE_DEF_CHIP_OPS set, and either .mask or .unmask is NULL in the irq_chip structure, we set them to pci_msi_[un]mask_irq. This is a bad idea for at least two reasons: - PCI_MSI might not be selected, kernel fails to build (yes, this is legitimate, at least on arm64!) - This may not be a PCI/MSI domain at all (platform MSI, for example) Either way, this looks wrong. Move the overriding of mask/unmask to the PCI counterpart, and panic if either of these two methods is not set in the core code (they really should be present). 
Signed-off-by: Marc Zyngier Cc: Jiang Liu Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1444760085-27857-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 4 ++++ kernel/irq/msi.c | 6 +----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d4497141d083..4a7da3c3e035 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1243,6 +1243,10 @@ static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info) BUG_ON(!chip); if (!chip->irq_write_msi_msg) chip->irq_write_msi_msg = pci_msi_domain_write_msg; + if (!chip->irq_mask) + chip->irq_mask = pci_msi_mask_irq; + if (!chip->irq_unmask) + chip->irq_unmask = pci_msi_unmask_irq; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7e6512b9dc1f..be9149f62eb8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) { struct irq_chip *chip = info->chip; - BUG_ON(!chip); - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; + BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); if (!chip->irq_set_affinity) chip->irq_set_affinity = msi_domain_set_affinity; } -- cgit v1.2.3-71-gd317 From 56fd16cabac9cd8f15e2902898a9d0cc96e2fa70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 16 Oct 2015 15:50:22 +0200 Subject: timekeeping: Increment clock_was_set_seq in timekeeping_init() timekeeping_init() can set the wall time offset, so we need to increment the clock_was_set_seq counter. That way hrtimers will pick up the early offset immediately. Otherwise on a machine which does not set wall time later in the boot process the hrtimer offset is stale at 0 and wall time timers are going to expire with a delay of 45 years. 
Fixes: 868a3e915f7f "hrtimer: Make offset update smarter" Reported-and-tested-by: Heiko Carstens Signed-off-by: Thomas Gleixner Cc: Stefan Liebler Cc: Peter Zijlstra Cc: John Stultz --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6aa473..44d2cc0436f4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1251,7 +1251,7 @@ void __init timekeeping_init(void) set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.2.3-71-gd317 From fde7d22e01aa0d252fc5c95fa11f0dac35a4dd59 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:22 +0800 Subject: sched/fair: Fix overly small weight for interactive group entities Commit: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") led to an overly small weight for interactive group entities. The bad case can be easily reproduced when a number of CPU hogs compete for the CPUs at the same time (thanks to Mike). This is largely because the task group's load average tracking across CPUs lags behind the real changes. To fix this we accelerate the group share distribution process by using the load.weight of the cfs_rq. This may increase the entire group's share, but we have to do so to protect the (fragile) interactive tasks, especially from CPU hogs. 
Reported-by: Mike Galbraith Tested-by: Dietmar Eggemann Tested-by: Mike Galbraith Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e3483b1ec..bc62c5096e54 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) */ tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq_load_avg(cfs_rq); + tg_weight += cfs_rq->load.weight; return tg_weight; } @@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq_load_avg(cfs_rq); + load = cfs_rq->load.weight; shares = (tg->shares * load); if (tg_weight) -- cgit v1.2.3-71-gd317 From 3e386d56bafbb6d2540b49367444997fc671ea69 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 13 Oct 2015 09:18:23 +0800 Subject: sched/fair: Update task group's load_avg after task migration When cfs_rq has cfs_rq->removed_load_avg set (when a task migrates from this cfs_rq), we need to update its contribution to the group's load_avg. This should not increase tg's update too much, because in most cases, the cfs_rq has already decayed its load_avg. 
Tested-by: Dietmar Eggemann Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444699103-20272-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc62c5096e54..9a5e60fe721a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - int decayed; struct sched_avg *sa = &cfs_rq->avg; + int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + removed = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed; + return decayed || removed; } /* Update task and its cfs_rq load average */ -- cgit v1.2.3-71-gd317 From 0baabb385eb4bce699ddab0db015112be6cf1e6a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Oct 2015 17:21:23 +0200 Subject: nohz: Revert "nohz: Set isolcpus when nohz_full is set" This reverts: 8cb9764fc88b ("nohz: Set isolcpus when nohz_full is set") We assumed that full-nohz users always want scheduler isolation on full dynticks CPUs, therefore we included full-nohz CPUs on cpu_isolated_map. This means that tasks run by default on CPUs outside the nohz_full range unless their affinity is explicitly overwritten. 
This suits pure isolation workloads but when the machine is needed to run common workloads, the available set of CPUs to run common tasks becomes reduced. We reach an extreme case when CONFIG_NO_HZ_FULL_ALL is enabled as it leaves only CPU 0 for non-isolation tasks, which makes people think that their supercomputer regressed to 90's UP - which is true in a sense. Some full-nohz users appear to be interested in running normal workloads either before or after an isolation workload. Full-nohz isn't optimized toward normal workloads but it's still better than UP performance. We are reaching a limitation in kernel presets here. Let's revert this cpu_isolated_map inclusion and let userspace do its own scheduler isolation using cpusets or explicit affinity settings. Reported-by: Ingo Molnar Reported-by: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Dave Jones Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Link: http://lkml.kernel.org/r/1444663283-30068-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10a8faa1b0d4..5bd7d60658d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7238,9 +7238,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - /* nohz_full won't take effect without isolating the cpus. 
*/ - tick_nohz_full_add_cpus_to(cpu_isolated_map); - sched_init_numa(); /* -- cgit v1.2.3-71-gd317 From 5aa5050787f449e7eaef2c5ec93c7b357aa7dcdc Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Fri, 16 Oct 2015 10:06:21 +0200 Subject: sched/deadline: Fix migration of SCHED_DEADLINE tasks Commit: 9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target") broke select_task_rq_dl() and find_lock_later_rq(), because it introduced a comparison between the local task's deadline and dl.earliest_dl.curr of the remote queue. However, if the remote runqueue does not contain any SCHED_DEADLINE task its earliest_dl.curr is 0 (always smaller than the deadline of the local task) and the remote runqueue is not selected for pushing. As a result, if an application creates multiple SCHED_DEADLINE threads, they will never be pushed to runqueues that do not already contain SCHED_DEADLINE tasks. This patch fixes the issue by checking if dl.dl_nr_running == 0. Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Fixes: 9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target") Link: http://lkml.kernel.org/r/1444982781-15608-1-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc8f01083527..142df2668e5d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1066,8 +1066,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_later_rq(p); if (target != -1 && - dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr)) + (dl_time_before(p->dl.deadline, + cpu_rq(target)->dl.earliest_dl.curr) || + (cpu_rq(target)->dl.dl_nr_running 
== 0))) cpu = target; } rcu_read_unlock(); @@ -1417,7 +1418,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (!dl_time_before(task->dl.deadline, + if (later_rq->dl.dl_nr_running && + !dl_time_before(task->dl.deadline, later_rq->dl.earliest_dl.curr)) { /* * Target rq has tasks of equal or earlier deadline, -- cgit v1.2.3-71-gd317 From a2d7629048322ae62bff57f34f5f995e25ed234c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Oct 2015 11:38:08 -0400 Subject: tracing: Have stack tracer force RCU to be watching The stack tracer was triggering the WARN_ON() in module.c: static void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP if (unlikely(!debug_locks)) return; WARN_ON(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } The reason is that the stack tracer traces all function calls, and some of those calls happen while exiting or entering user space and idle. Some of these functions are called after RCU had already stopped watching, as RCU does not watch userspace or idle CPUs. If a max stack is hit, then the save_stack_trace() is called, which will check module addresses and call module_assert_mutex_or_preempt(), and then trigger the warning. Sad part is, the warning itself will also do a stack trace and trigger the same warning. That probably should be fixed. The warning was added by 0be964be0d45 "module: Sanitize RCU usage and locking" but this bug has probably been around longer. It's unlikely to cause much harm, but the new warning causes the system to lock up. Cc: stable@vger.kernel.org # 4.2+ Cc: Peter Zijlstra Cc:"Paul E. 
McKenney" Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b746399ab59c..5f29402bff0f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -88,6 +88,12 @@ check_stack(unsigned long ip, unsigned long *stack) local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* + * RCU may not be watching, make it see us. + * The stack trace code uses rcu_sched. + */ + rcu_irq_enter(); + /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -169,6 +175,7 @@ check_stack(unsigned long ip, unsigned long *stack) } out: + rcu_irq_exit(); arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } -- cgit v1.2.3-71-gd317 From 1904be1b6bb92058c8e00063dd59df2df294e258 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Oct 2015 21:48:02 -0400 Subject: tracing: Do not allow stack_tracer to record stack in NMI The code in stack tracer should not be executed within an NMI as it grabs spinlocks and stack tracing an NMI gives the possibility of causing a deadlock. Although this is safe on x86_64, because it does not perform stack traces when the task struct stack is not in use (interrupts and NMIs), it may be an issue for NMIs on i386 and other archs that use the same stack as the NMI. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5f29402bff0f..8abf1ba18085 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -85,6 +85,10 @@ check_stack(unsigned long ip, unsigned long *stack) if (!object_is_on_stack(stack)) return; + /* Can't do this from NMI context (can cause deadlocks) */ + if (in_nmi()) + return; + local_irq_save(flags); arch_spin_lock(&max_stack_lock); -- cgit v1.2.3-71-gd317 From 5211613978cb7353a3237e4372958c0e7514683f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 22 Oct 2015 13:32:08 -0700 Subject: kmod: don't run async usermode helper as a child of kworker thread call_usermodehelper_exec_sync() does fork() + wait() with "unignored" SIGCHLD. What we have missed is that this worker thread can have other children previously forked by call_usermodehelper_exec_work() without UMH_WAIT_PROC. If such a child exits in between it becomes a zombie because auto-reaping only works if SIGCHLD is ignored, and nobody can reap it (unless/until this worker thread exits too). Change the !UMH_WAIT_PROC case to use CLONE_PARENT. Note: this is only first step. All PF_KTHREAD tasks, even created by kernel_thread() should have ->parent == kthreadd by default. 
Fixes: bb304a5c6fc63d8506c ("kmod: handle UMH_WAIT_PROC from system unbound workqueue") Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Rik van Riel Cc: Christoph Lameter Cc: Tejun Heo Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index da98d0593de2..0277d1216f80 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work) call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; - + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - SIGCHLD); + CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); -- cgit v1.2.3-71-gd317 From 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Oct 2015 11:50:08 +0200 Subject: sched/core: Add missing lockdep_unpin() annotations Luca and Wanpeng reported two missing annotations that led to false lockdep complaints. Add the missing annotations. 
Reported-by: Luca Abeni Reported-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cbce1a686700 ("sched,lockdep: Employ lock pinning") Link: http://lkml.kernel.org/r/20151023095008.GY17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/deadline.c | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bd7d60658d3..bcd214e4b4d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p) trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } #endif task_rq_unlock(rq, p, &flags); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 142df2668e5d..8b0a15e285f9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Queueing this task back might have overloaded rq, check if we need * to kick someone away. */ - if (has_pushable_dl_tasks(rq)) + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + lockdep_unpin_lock(&rq->lock); push_dl_task(rq); + lockdep_pin_lock(&rq->lock); + } #endif unlock: -- cgit v1.2.3-71-gd317