From 874bbfe600a660cba9c776b3957b1ce393151b76 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 30 Sep 2015 09:05:30 -0700
Subject: workqueue: make sure delayed work run in local cpu

My system keeps crashing with below message. vmstat_update() schedules a delayed
work in current cpu and expects the work runs in the cpu.
schedule_delayed_work() is expected to make delayed work run in local cpu. The
problem is timer can be migrated with NO_HZ. __queue_work() queues work in
timer handler, which could run in a different cpu other than where the delayed
work is scheduled. The end result is the delayed work runs in different cpu.
The patch makes __queue_delayed_work records local cpu earlier. Where the timer
runs doesn't change where the work runs with the change.

[   28.010131] ------------[ cut here ]------------
[   28.010609] kernel BUG at ../mm/vmstat.c:1392!
[   28.011099] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN
[   28.011860] Modules linked in:
[   28.012245] CPU: 0 PID: 289 Comm: kworker/0:3 Tainted: G        W4.3.0-rc3+ #634
[   28.013065] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014
[   28.014160] Workqueue: events vmstat_update
[   28.014571] task: ffff880117682580 ti: ffff8800ba428000 task.ti: ffff8800ba428000
[   28.015445] RIP: 0010:[<ffffffff8115f921>]  [<ffffffff8115f921>]vmstat_update+0x31/0x80
[   28.016282] RSP: 0018:ffff8800ba42fd80  EFLAGS: 00010297
[   28.016812] RAX: 0000000000000000 RBX: ffff88011a858dc0 RCX:0000000000000000
[   28.017585] RDX: ffff880117682580 RSI: ffffffff81f14d8c RDI:ffffffff81f4df8d
[   28.018366] RBP: ffff8800ba42fd90 R08: 0000000000000001 R09:0000000000000000
[   28.019169] R10: 0000000000000000 R11: 0000000000000121 R12:ffff8800baa9f640
[   28.019947] R13: ffff88011a81e340 R14: ffff88011a823700 R15:0000000000000000
[   28.020071] FS:  0000000000000000(0000) GS:ffff88011a800000(0000)knlGS:0000000000000000
[   28.020071] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   28.020071] CR2: 00007ff6144b01d0 CR3: 00000000b8e93000 CR4:00000000000006f0
[   28.020071] Stack:
[   28.020071]  ffff88011a858dc0 ffff8800baa9f640 ffff8800ba42fe00ffffffff8106bd88
[   28.020071]  ffffffff8106bd0b 0000000000000096 0000000000000000ffffffff82f9b1e8
[   28.020071]  ffffffff829f0b10 0000000000000000 ffffffff81f18460ffff88011a81e340
[   28.020071] Call Trace:
[   28.020071]  [<ffffffff8106bd88>] process_one_work+0x1c8/0x540
[   28.020071]  [<ffffffff8106bd0b>] ? process_one_work+0x14b/0x540
[   28.020071]  [<ffffffff8106c214>] worker_thread+0x114/0x460
[   28.020071]  [<ffffffff8106c100>] ? process_one_work+0x540/0x540
[   28.020071]  [<ffffffff81071bf8>] kthread+0xf8/0x110
[   28.020071]  [<ffffffff81071b00>] ?kthread_create_on_node+0x200/0x200
[   28.020071]  [<ffffffff81a6522f>] ret_from_fork+0x3f/0x70
[   28.020071]  [<ffffffff81071b00>] ?kthread_create_on_node+0x200/0x200

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v2.6.31+
---
 kernel/workqueue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca71582fcfab..bcb14cafe007 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	dwork->wq = wq;
+	/* timer isn't guaranteed to run in this cpu, record earlier */
+	if (cpu == WORK_CPU_UNBOUND)
+		cpu = raw_smp_processor_id();
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
-	if (unlikely(cpu != WORK_CPU_UNBOUND))
-		add_timer_on(timer, cpu);
-	else
-		add_timer(timer);
+	add_timer_on(timer, cpu);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 9babcd7929bc8967ae3bb6093f603b93c2f9958f Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Thu, 8 Oct 2015 15:36:06 -0300
Subject: sched, tracing: Stop/start critical timings around the idle=poll idle
 loop

When using idle=poll, the preemptoff tracer is always showing
the idle task as the culprit for long latencies. That happens
because critical timings are not stopped before idle loop. This
patch stops critical timings before entering the idle loop,
starting it again after the idle loop.

This problem does not affect the irqsoff tracer because
interruptions are enabled before entering the idle loop.

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reviewed-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/10fc3705874aef11dbe152a068b591a7be1899b4.1444314899.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f177c73ae19..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
 	rcu_idle_enter();
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
+	stop_critical_timings();
 	while (!tif_need_resched() &&
 		(cpu_idle_force_poll || tick_check_broadcast_expired()))
 		cpu_relax();
+	start_critical_timings();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	rcu_idle_exit();
 	return 1;
-- 
cgit v1.2.3-71-gd317


From 0701c53e460ea64daf0ee789d0b08fef57800016 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 13 Oct 2015 19:14:45 +0100
Subject: genirq/msi: Do not use pci_msi_[un]mask_irq as default methods

When we create a generic MSI domain, that MSI_FLAG_USE_DEF_CHIP_OPS
is set, and that any of .mask or .unmask are NULL in the irq_chip
structure, we set them to pci_msi_[un]mask_irq.

This is a bad idea for at least two reasons:
- PCI_MSI might not be selected, kernel fails to build (yes, this is
  legitimate, at least on arm64!)
- This may not be a PCI/MSI domain at all (platform MSI, for example)

Either way, this looks wrong. Move the overriding of mask/unmask to
the PCI counterpart, and panic is any of these two methods is not
set in the core code (they really should be present).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/1444760085-27857-1-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c | 4 ++++
 kernel/irq/msi.c  | 6 +-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index d4497141d083..4a7da3c3e035 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1243,6 +1243,10 @@ static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info)
 	BUG_ON(!chip);
 	if (!chip->irq_write_msi_msg)
 		chip->irq_write_msi_msg = pci_msi_domain_write_msg;
+	if (!chip->irq_mask)
+		chip->irq_mask = pci_msi_mask_irq;
+	if (!chip->irq_unmask)
+		chip->irq_unmask = pci_msi_unmask_irq;
 }
 
 /**
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7e6512b9dc1f..be9149f62eb8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
 {
 	struct irq_chip *chip = info->chip;
 
-	BUG_ON(!chip);
-	if (!chip->irq_mask)
-		chip->irq_mask = pci_msi_mask_irq;
-	if (!chip->irq_unmask)
-		chip->irq_unmask = pci_msi_unmask_irq;
+	BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
 	if (!chip->irq_set_affinity)
 		chip->irq_set_affinity = msi_domain_set_affinity;
 }
-- 
cgit v1.2.3-71-gd317


From 56fd16cabac9cd8f15e2902898a9d0cc96e2fa70 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 16 Oct 2015 15:50:22 +0200
Subject: timekeeping: Increment clock_was_set_seq in timekeeping_init()

timekeeping_init() can set the wall time offset, so we need to
increment the clock_was_set_seq counter. That way hrtimers will pick
up the early offset immediately. Otherwise on a machine which does not
set wall time later in the boot process the hrtimer offset is stale at
0 and wall time timers are going to expire with a delay of 45 years.

Fixes: 868a3e915f7f "hrtimer: Make offset update smarter"
Reported-and-tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stefan Liebler <stli@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3739ac6aa473..44d2cc0436f4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)
 	set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
 	tk_set_wall_to_mono(tk, tmp);
 
-	timekeeping_update(tk, TK_MIRROR);
+	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-- 
cgit v1.2.3-71-gd317


From fde7d22e01aa0d252fc5c95fa11f0dac35a4dd59 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Tue, 13 Oct 2015 09:18:22 +0800
Subject: sched/fair: Fix overly small weight for interactive group entities

Commit:

  9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")

led to an overly small weight for interactive group entities. The bad case
can be easily reproduced when a number of CPU hogs compete for the CPUs
at the same time (thanks to Mike). This is largly because the task group's
load average tracking cross CPUs lags behind the real changes.

To fix this we accelerate the group share distribution process by using
the load.weight of the cfs_rq. This may increase the entire group's
share, but we have to do so to protect the (fragile) interactive
tasks, especially from CPU hogs.

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1444699103-20272-1-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e2e3483b1ec..bc62c5096e54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	 */
 	tg_weight = atomic_long_read(&tg->load_avg);
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
-	tg_weight += cfs_rq_load_avg(cfs_rq);
+	tg_weight += cfs_rq->load.weight;
 
 	return tg_weight;
 }
@@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	long tg_weight, load, shares;
 
 	tg_weight = calc_tg_weight(tg, cfs_rq);
-	load = cfs_rq_load_avg(cfs_rq);
+	load = cfs_rq->load.weight;
 
 	shares = (tg->shares * load);
 	if (tg_weight)
-- 
cgit v1.2.3-71-gd317


From 3e386d56bafbb6d2540b49367444997fc671ea69 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Tue, 13 Oct 2015 09:18:23 +0800
Subject: sched/fair: Update task group's load_avg after task migration

When cfs_rq has cfs_rq->removed_load_avg set (when a task migrates from
this cfs_rq), we need to update its contribution to the group's load_avg.

This should not increase tg's update too much, because in most cases, the
cfs_rq has already decayed its load_avg.

Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1444699103-20272-2-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc62c5096e54..9a5e60fe721a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-	int decayed;
 	struct sched_avg *sa = &cfs_rq->avg;
+	int decayed, removed = 0;
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		removed = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	return decayed;
+	return decayed || removed;
 }
 
 /* Update task and its cfs_rq load average */
-- 
cgit v1.2.3-71-gd317


From 0baabb385eb4bce699ddab0db015112be6cf1e6a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Oct 2015 17:21:23 +0200
Subject: nohz: Revert "nohz: Set isolcpus when nohz_full is set"

This reverts:

  8cb9764fc88b ("nohz: Set isolcpus when nohz_full is set")

We assumed that full-nohz users always want scheduler isolation on full
dynticks CPUs, therefore we included full-nohz CPUs on cpu_isolated_map.

This means that tasks run by default on CPUs outside the nohz_full range
unless their affinity is explicity overwritten.

This suits pure isolation workloads but when the machine is needed to
run common workloads, the available sets of CPUs to run common tasks
becomes reduced.

We reach an extreme case when CONFIG_NO_HZ_FULL_ALL is enabled as it
leaves only CPU 0 for non-isolation tasks, which makes people think that
their supercomputer regressed to 90's UP - which is true in a sense.

Some full-nohz users appear to be interested in running normal workloads
either before or after an isolation workload. Full-nohz isn't optimized
toward normal workloads but it's still better than UP performance.

We are reaching a limitation in kernel presets here. Lets revert this
cpu_isolated_map inclusion and let userspace do its own scheduler
isolation using cpusets or explicit affinity settings.

Reported-by: Ingo Molnar <mingo@kernel.org>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Link: http://lkml.kernel.org/r/1444663283-30068-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 10a8faa1b0d4..5bd7d60658d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7238,9 +7238,6 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-	/* nohz_full won't take effect without isolating the cpus. */
-	tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
 	sched_init_numa();
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 5aa5050787f449e7eaef2c5ec93c7b357aa7dcdc Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@unitn.it>
Date: Fri, 16 Oct 2015 10:06:21 +0200
Subject: sched/deadline: Fix migration of SCHED_DEADLINE tasks

Commit:

  9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target")

broke select_task_rq_dl() and find_lock_later_rq(), because it introduced
a comparison between the local task's deadline and dl.earliest_dl.curr of
the remote queue.

However, if the remote runqueue does not contain any SCHED_DEADLINE
task its earliest_dl.curr is 0 (always smaller than the deadline of
the local task) and the remote runqueue is not selected for pushing.

As a result, if an application creates multiple SCHED_DEADLINE
threads, they will never be pushed to runqueues that do not already
contain SCHED_DEADLINE tasks.

This patch fixes the issue by checking if dl.dl_nr_running == 0.

Signed-off-by: Luca Abeni <luca.abeni@unitn.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Fixes: 9d5142624256 ("sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target")
Link: http://lkml.kernel.org/r/1444982781-15608-1-git-send-email-luca.abeni@unitn.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc8f01083527..142df2668e5d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1066,8 +1066,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 		int target = find_later_rq(p);
 
 		if (target != -1 &&
-				dl_time_before(p->dl.deadline,
-					cpu_rq(target)->dl.earliest_dl.curr))
+				(dl_time_before(p->dl.deadline,
+					cpu_rq(target)->dl.earliest_dl.curr) ||
+				(cpu_rq(target)->dl.dl_nr_running == 0)))
 			cpu = target;
 	}
 	rcu_read_unlock();
@@ -1417,7 +1418,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 
 		later_rq = cpu_rq(cpu);
 
-		if (!dl_time_before(task->dl.deadline,
+		if (later_rq->dl.dl_nr_running &&
+		    !dl_time_before(task->dl.deadline,
 					later_rq->dl.earliest_dl.curr)) {
 			/*
 			 * Target rq has tasks of equal or earlier deadline,
-- 
cgit v1.2.3-71-gd317


From a2d7629048322ae62bff57f34f5f995e25ed234c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 20 Oct 2015 11:38:08 -0400
Subject: tracing: Have stack tracer force RCU to be watching

The stack tracer was triggering the WARN_ON() in module.c:

 static void module_assert_mutex_or_preempt(void)
 {
 #ifdef CONFIG_LOCKDEP
	if (unlikely(!debug_locks))
		return;

	WARN_ON(!rcu_read_lock_sched_held() &&
		!lockdep_is_held(&module_mutex));
 #endif
 }

The reason is that the stack tracer traces all function calls, and some of
those calls happen while exiting or entering user space and idle. Some of
these functions are called after RCU had already stopped watching, as RCU
does not watch userspace or idle CPUs.

If a max stack is hit, then the save_stack_trace() is called, which will
check module addresses and call module_assert_mutex_or_preempt(), and then
trigger the warning. Sad part is, the warning itself will also do a stack
trace and tigger the same warning. That probably should be fixed.

The warning was added by 0be964be0d45 "module: Sanitize RCU usage and
locking" but this bug has probably been around longer. But it's unlikely to
cause much harm, but the new warning causes the system to lock up.

Cc: stable@vger.kernel.org # 4.2+
Cc: Peter Zijlstra <peterz@infradead.org>
Cc:"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b746399ab59c..5f29402bff0f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -88,6 +88,12 @@ check_stack(unsigned long ip, unsigned long *stack)
 	local_irq_save(flags);
 	arch_spin_lock(&max_stack_lock);
 
+	/*
+	 * RCU may not be watching, make it see us.
+	 * The stack trace code uses rcu_sched.
+	 */
+	rcu_irq_enter();
+
 	/* In case another CPU set the tracer_frame on us */
 	if (unlikely(!frame_size))
 		this_size -= tracer_frame;
@@ -169,6 +175,7 @@ check_stack(unsigned long ip, unsigned long *stack)
 	}
 
  out:
+	rcu_irq_exit();
 	arch_spin_unlock(&max_stack_lock);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-71-gd317


From 1904be1b6bb92058c8e00063dd59df2df294e258 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 20 Oct 2015 21:48:02 -0400
Subject: tracing: Do not allow stack_tracer to record stack in NMI

The code in stack tracer should not be executed within an NMI as it grabs
spinlocks and stack tracing an NMI gives the possibility of causing a
deadlock. Although this is safe on x86_64, because it does not perform stack
traces when the task struct stack is not in use (interrupts and NMIs), it
may be an issue for NMIs on i386 and other archs that use the same stack as
the NMI.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5f29402bff0f..8abf1ba18085 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -85,6 +85,10 @@ check_stack(unsigned long ip, unsigned long *stack)
 	if (!object_is_on_stack(stack))
 		return;
 
+	/* Can't do this from NMI context (can cause deadlocks) */
+	if (in_nmi())
+		return;
+
 	local_irq_save(flags);
 	arch_spin_lock(&max_stack_lock);
 
-- 
cgit v1.2.3-71-gd317


From 5211613978cb7353a3237e4372958c0e7514683f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 22 Oct 2015 13:32:08 -0700
Subject: kmod: don't run async usermode helper as a child of kworker thread

call_usermodehelper_exec_sync() does fork() + wait() with "unignored"
SIGCHLD.  What we have missed is that this worker thread can have other
children previously forked by call_usermodehelper_exec_work() without
UMH_WAIT_PROC.  If such a child exits in between it becomes a zombie
because auto-reaping only works if SIGCHLD is ignored, and nobody can
reap it (unless/until this worker thread exits too).

Change the !UMH_WAIT_PROC case to use CLONE_PARENT.

Note: this is only first step.  All PF_KTHREAD tasks, even created by
kernel_thread() should have ->parent == kthreadd by default.

Fixes: bb304a5c6fc63d8506c ("kmod: handle UMH_WAIT_PROC from system unbound workqueue")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index da98d0593de2..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		call_usermodehelper_exec_sync(sub_info);
 	} else {
 		pid_t pid;
-
+		/*
+		 * Use CLONE_PARENT to reparent it to kthreadd; we do not
+		 * want to pollute current->children, and we need a parent
+		 * that always ignores SIGCHLD to ensure auto-reaping.
+		 */
 		pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
-				    SIGCHLD);
+				    CLONE_PARENT | SIGCHLD);
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
-- 
cgit v1.2.3-71-gd317


From 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 23 Oct 2015 11:50:08 +0200
Subject: sched/core: Add missing lockdep_unpin() annotations

Luca and Wanpeng reported two missing annotations that led to
false lockdep complaints. Add the missing annotations.

Reported-by: Luca Abeni <luca.abeni@unitn.it>
Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cbce1a686700 ("sched,lockdep: Employ lock pinning")
Link: http://lkml.kernel.org/r/20151023095008.GY17308@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 9 ++++++++-
 kernel/sched/deadline.c | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5bd7d60658d3..bcd214e4b4d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p)
 	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * Nothing relies on rq->lock after this, so its fine to
+		 * drop it.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
+	}
 #endif
 	task_rq_unlock(rq, p, &flags);
 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 142df2668e5d..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 * Queueing this task back might have overloaded rq, check if we need
 	 * to kick someone away.
 	 */
-	if (has_pushable_dl_tasks(rq))
+	if (has_pushable_dl_tasks(rq)) {
+		/*
+		 * Nothing relies on rq->lock after this, so its safe to drop
+		 * rq->lock.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		push_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
+	}
 #endif
 
 unlock:
-- 
cgit v1.2.3-71-gd317