From d49db342f0e276b354383b3281c5668b6b80f5c2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 24 Jun 2015 12:41:47 -0700 Subject: sched/fair: Test list head instead of list entry in throttle_cfs_rq() According to the comments, we need to test if this is the first throttled task, however, list_empty() tests on the entry cfs_rq->throttled_list, not the head, this is wrong. This is a bug because we don't re-init the list entry after removing it from the list, so list_empty() could return false even if the list is really empty. Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435174907-432-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 65c8f3ebdc3c..d113c3ba8bc4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3683,7 +3683,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_rq->throttled_list); + empty = list_empty(&cfs_b->throttled_cfs_rq); /* * Add to the _head_ of the list, so that an already-started -- cgit v1.2.3-71-gd317 From 6224beb12e190ff11f3c7d4bf50cb2922878f600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 7 Jul 2015 15:05:03 -0400 Subject: tracing: Have branch tracer use recursive field of task struct Fengguang Wu's tests triggered a bug in the branch tracer's start up test when CONFIG_DEBUG_PREEMPT set. This was because that config adds some debug logic in the per cpu field, which calls back into the branch tracer. The branch tracer has its own recursive checks, but uses a per cpu variable to implement it. If retrieving the per cpu variable calls back into the branch tracer, you can see how things will break. Instead of using a per cpu variable, use the trace_recursion field of the current task struct. Simply set a bit when entering the branch tracing and clear it when leaving. If the bit is set on entry, just don't do the tracing. There's also the case with lockdep, as the local_irq_save() called before the recursion can also trigger code that can call back into the function. Changing that to a raw_local_irq_save() will protect that as well. This prevents the recursion and the inevitable crash that follows. Link: http://lkml.kernel.org/r/20150630141803.GA28071@wfg-t540p.sh.intel.com Cc: stable@vger.kernel.org # 3.10+ Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_branch.c | 17 ++++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f060716b02ae..74bde81601a9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,7 @@ enum { TRACE_CONTROL_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. 
* As we need a way to maintain state if we are tracing the function diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index a87b43f49eb4..e2e12ad3186f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_branch *entry; struct ring_buffer *buffer; unsigned long flags; - int cpu, pc; + int pc; const char *p; + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + /* * I would love to save just the ftrace_likely_data pointer, but * this code can also be used by modules. Ugly things can happen @@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); - if (atomic_inc_return(&data->disabled) != 1) + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->trace_buffer.data); + if (atomic_read(&data->disabled)) goto out; pc = preempt_count(); @@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&data->disabled); - local_irq_restore(flags); + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); } static inline -- cgit v1.2.3-71-gd317 From 0f44705175347ec96935d60b765b5d14ecc763bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2015 12:01:04 +0200 Subject: tick: Move the export of tick_broadcast_oneshot_control to the proper place tick_broadcast_oneshot_control got moved from tick-broadcast to tick-common, but the export stayed in the old place. Fix it up. Fixes: f32dd1170511 'tick/broadcast: Make idle check independent from mode and config' Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 1 - kernel/time/tick-common.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 52b9e199b5ac..f6aae7977824 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -839,7 +839,6 @@ out: raw_spin_unlock(&tick_broadcast_lock); return ret; } -EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 55e13efff1ab..f8bf47571dda 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -363,6 +363,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) return __tick_broadcast_oneshot_control(state); } +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.2.3-71-gd317 From ce0d3c0a6fb1422101498ef378c0851dabbbf67f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2015 22:03:57 +0200 Subject: genirq: Revert sparse irq locking around __cpu_up() and move it to x86 for now Boris reported that the sparse_irq protection around __cpu_up() in the generic code causes a regression on Xen. Xen allocates interrupts and some more in the xen_cpu_up() function, so it deadlocks on the sparse_irq_lock. There is no simple fix for this and we really should have the protection for all architectures, but for now the only solution is to move it to x86 where actual wreckage due to the lack of protection has been observed. 
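[ Illustrative aside, not part of the patch: taking the changelog at face value, the Xen regression is a plain self-acquisition of the sparse irq lock through the arch bringup hook. The sketch below models that in ordinary userspace C, with an error-checking pthread mutex standing in for the mutex behind irq_lock_sparse(); generic_cpu_up() and xen_cpu_up_hook() are invented names for the example. Build with something like: cc -pthread sketch.c ]

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* stand-in for the sparse irq lock; error-checking so the self-deadlock
 * reports EDEADLK instead of just blocking forever */
static pthread_mutex_t irq_lock;

static void xen_cpu_up_hook(void)
{
	/* the Xen bringup path allocates irq descriptors, which needs the
	 * very lock its caller is already holding */
	if (pthread_mutex_lock(&irq_lock) == EDEADLK) {
		printf("deadlock: arch hook wants the lock generic code holds\n");
		return;
	}
	pthread_mutex_unlock(&irq_lock);
}

static void generic_cpu_up(void)
{
	/* what kernel/cpu.c did before this patch: hold the lock across
	 * the arch-specific bringup hook */
	pthread_mutex_lock(&irq_lock);
	xen_cpu_up_hook();
	pthread_mutex_unlock(&irq_lock);
}

int main(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
	pthread_mutex_init(&irq_lock, &attr);

	generic_cpu_up();
	return 0;
}

Moving the locking into native_cpu_up(), as the diff below does, keeps the descriptor walk protected for the native x86 bringup while no longer imposing the lock on Xen's own cpu_up path, where the allocations happen.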
Reported-and-tested-by: Boris Ostrovsky Fixes: a89941816726 'hotplug: Prevent alloc/free of irq descriptors during cpu up/down' Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Cc: xen-devel --- arch/x86/kernel/smpboot.c | 11 +++++++++++ kernel/cpu.c | 9 --------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d3010aa79daf..b1f3ed9c7a9e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); + /* + * We have to walk the irq descriptors to setup the vector + * space for the cpu which comes online. Prevent irq + * alloc/free across the bringup. + */ + irq_lock_sparse(); + err = do_boot_cpu(apicid, cpu, tidle); + if (err) { + irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -1011,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } + irq_unlock_sparse(); + return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a374544d495..5644ec5582b9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -527,18 +527,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; } - /* - * Some architectures have to walk the irq descriptors to - * setup the vector space for the cpu which comes online. - * Prevent irq alloc/free across the bringup. - */ - irq_lock_sparse(); - /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); - irq_unlock_sparse(); - if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); -- cgit v1.2.3-71-gd317 From 75a06189fc508a2acf470b0b12710362ffb2c4b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jul 2015 14:10:17 +0200 Subject: genirq: Prevent resend to interrupts marked IRQ_NESTED_THREAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resend mechanism happily calls the interrupt handler of interrupts which are marked IRQ_NESTED_THREAD from softirq context. This can result in crashes because the interrupt handler is not the proper way to invoke the device handlers. They must be invoked via handle_nested_irq. Prevent the resend even if the interrupt has no valid parent irq set. Its better to have a lost interrupt than a crashing machine. Reported-by: Uwe Kleine-König Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/resend.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..7a5237a1bce5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -75,13 +75,21 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* - * If the interrupt has a parent irq and runs - * in the thread context of the parent irq, - * retrigger the parent. + * If the interrupt is running in the thread + * context of the parent irq we need to be + * careful, because we cannot trigger it + * directly. */ - if (desc->parent_irq && - irq_settings_is_nested_thread(desc)) + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we + * retrigger the parent, otherwise we + * do nothing. 
+ */ + if (!desc->parent_irq) + return; irq = desc->parent_irq; + } /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); -- cgit v1.2.3-71-gd317 From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 Jul 2015 12:28:11 +0200 Subject: x86/fpu, sched: Dynamically allocate 'struct fpu' The FPU rewrite removed the dynamic allocations of 'struct fpu'. But, this potentially wastes massive amounts of memory (2k per task on systems that do not have AVX-512 for instance). Instead of having a separate slab, this patch just appends the space that we need to the 'task_struct' which we dynamically allocate already. This saves from doing an extra slab allocation at fork(). The only real downside here is that we have to stick everything and the end of the task_struct. But, I think the BUILD_BUG_ON()s I stuck in there should keep that from being too fragile. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 72 +++++++++++++++++++++------------------- arch/x86/include/asm/processor.h | 10 ++++-- arch/x86/kernel/fpu/init.c | 39 ++++++++++++++++++++++ arch/x86/kernel/process.c | 2 +- fs/proc/kcore.c | 4 +-- include/linux/sched.h | 12 +++++-- kernel/fork.c | 8 ++++- 7 files changed, 104 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0637826292de..c49c5173158e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -189,6 +189,7 @@ union fpregs_state { struct fxregs_state fxsave; struct swregs_state soft; struct xregs_state xsave; + u8 __padding[PAGE_SIZE]; }; /* @@ -197,40 +198,6 @@ union fpregs_state { * state fields: */ struct fpu { - /* - * @state: - * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) 
- */ - union fpregs_state state; - /* * @last_cpu: * @@ -288,6 +255,43 @@ struct fpu { * deal with bursty apps that only use the FPU for a short time: */ unsigned char counter; + /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. + * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + /* + * WARNING: 'state' is dynamically-sized. Do not put + * anything after it here. + */ }; #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 43e6519df0d5..944f1785ed0d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -390,9 +390,6 @@ struct thread_struct { #endif unsigned long gs; - /* Floating point and extended processor state */ - struct fpu fpu; - /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ @@ -418,6 +415,13 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + + /* Floating point and extended processor state */ + struct fpu fpu; + /* + * WARNING: 'fpu' is dynamically-sized. It *MUST* be at + * the end. + */ }; /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 32826791e675..deacbfa6b33e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON((sizeof(TYPE) - \ + offsetof(TYPE, MEMBER) - \ + sizeof(((TYPE *)0)->MEMBER)) > \ + 0) \ + +/* + * We append the 'struct fpu' to the task_struct. + */ +int __weak arch_task_struct_size(void) +{ + int task_size = sizeof(struct task_struct); + + /* + * Subtract off the static size of the register state. + * It potentially has a bunch of padding. + */ + task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + + /* + * Add back the dynamically-calculated register state + * size. 
+ */ + task_size += xstate_size; + + /* + * We dynamically size 'struct fpu', so we require that + * it be at the end of 'thread_struct' and that + * 'thread_struct' be at the end of 'task_struct'. If + * you hit a compile error here, check the structure to + * see if something got added to the end. + */ + CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + + return task_size; +} + /* * Set up the xstate_size based on the legacy FPU context size. * diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9cad694ed7c4..975420eac105 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - *dst = *src; + memcpy(dst, src, arch_task_struct_size()); return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 91a4e6426321..a0fe99485687 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(sizeof(struct task_struct), 4); + roundup(arch_task_struct_size(), 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(struct task_struct); + notes[2].datasz = arch_task_struct_size(); notes[2].data = current; nhdr->p_filesz += notesize(¬es[2]); diff --git a/include/linux/sched.h b/include/linux/sched.h index ae21f1591615..e43a41d892b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1522,8 +1522,6 @@ struct task_struct { /* hung task detection */ unsigned long last_switch_count; #endif -/* CPU-specific state of this task */ - struct thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ @@ -1778,8 +1776,18 @@ struct task_struct { unsigned long task_state_change; #endif int pagefault_disabled; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* + * WARNING: on x86, 'thread_struct' contains a variable-sized + * structure. It *MUST* be at the end of 'task_struct'. + * + * Do not put anything below here! + */ }; +extern int arch_task_struct_size(void); + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) diff --git a/kernel/fork.c b/kernel/fork.c index 1bfefc6f96a4..431b67a6098c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } +int __weak arch_task_struct_size(void) +{ + return sizeof(struct task_struct); +} + void __init fork_init(void) { + int task_struct_size = arch_task_struct_size(); #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), + kmem_cache_create("task_struct", task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif -- cgit v1.2.3-71-gd317 From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Jul 2015 12:28:12 +0200 Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86 Don't burden architectures without dynamic task_struct sizing with the overhead of dynamic sizing. Also optimize the x86 code a bit by caching task_struct_size. Acked-and-Tested-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/Kconfig | 4 ++++ arch/x86/Kconfig | 1 + arch/x86/kernel/fpu/init.c | 17 +++++++++-------- arch/x86/kernel/process.c | 2 +- fs/proc/kcore.c | 4 ++-- include/linux/sched.h | 6 +++++- kernel/fork.c | 11 +++++------ 7 files changed, 27 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index bec6666a3cc4..8a8ea7110de8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR config ARCH_THREAD_INFO_ALLOCATOR bool +# Select if arch wants to size task_struct dynamically via arch_task_struct_size: +config ARCH_WANTS_DYNAMIC_TASK_STRUCT + bool + config HAVE_REGS_AND_STACK_ACCESS_API bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3dbb7e7909ca..b3a1a5d77d92 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,6 +41,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index deacbfa6b33e..0b39173dd971 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -4,6 +4,8 @@ #include #include +#include + /* * Initialize the TS bit in CR0 according to the style of context-switches * we are using: @@ -136,16 +138,14 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON((sizeof(TYPE) - \ - offsetof(TYPE, MEMBER) - \ - sizeof(((TYPE *)0)->MEMBER)) > \ - 0) \ +/* Enforce that 'MEMBER' is the last field of 'TYPE': */ +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) /* - * We append the 'struct fpu' to the task_struct. 
+ * We append the 'struct fpu' to the task_struct: */ -int __weak arch_task_struct_size(void) +static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); @@ -172,7 +172,7 @@ int __weak arch_task_struct_size(void) CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); - return task_size; + arch_task_struct_size = task_size; } /* @@ -326,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); + fpu__init_task_struct_size(); fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 975420eac105..397688beed4b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size()); + memcpy(dst, src, arch_task_struct_size); return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a0fe99485687..92e6726f6e37 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(arch_task_struct_size(), 4); + roundup(arch_task_struct_size, 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = arch_task_struct_size(); + notes[2].datasz = arch_task_struct_size; notes[2].data = current; nhdr->p_filesz += notesize(¬es[2]); diff --git a/include/linux/sched.h b/include/linux/sched.h index e43a41d892b6..04b5ada460b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1786,7 +1786,11 @@ struct task_struct { */ }; -extern int arch_task_struct_size(void); +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +extern int arch_task_struct_size __read_mostly; +#else +# define arch_task_struct_size (sizeof(struct task_struct)) +#endif /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) diff --git a/kernel/fork.c b/kernel/fork.c index 431b67a6098c..dbd9b8d7b7cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -287,21 +287,20 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } -int __weak arch_task_struct_size(void) -{ - return sizeof(struct task_struct); -} +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +/* Initialized by the architecture: */ +int arch_task_struct_size __read_mostly; +#endif void __init fork_init(void) { - int task_struct_size = arch_task_struct_size(); #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", task_struct_size, + kmem_cache_create("task_struct", arch_task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif -- cgit v1.2.3-71-gd317