From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:28 -0700
Subject: clocksource: keep track of original clocksource frequency

The clocksource frequency is represented by
clocksource->mult/2^(clocksource->shift).  Currently, when NTP makes
adjustments to the clock frequency, they are made directly to the mult
value.

This has the drawback that once changed, we cannot know what the orignal
mult value was, or how much adjustment has been applied.

This property causes problems in calculating proper ntp intervals when
switching back and forth between clocksources.

This patch separates the current mult value into a mult and mult_orig
pair.  The mult_orig value stays constant, while the ntp clocksource
adjustments are done only to the mult value.

This allows for correct ntp interval calculation and additionally lays the
groundwork for a new notion of time, what I'm calling the monotonic-raw
time, which is introduced in a following patch.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 55e434feec99..f0a7fb984413 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -45,7 +45,8 @@ struct clocksource;
  * @read:		returns a cycle value
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
- * @mult:		cycle to nanosecond multiplier
+ * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
+ * @mult_orig:		cycle to nanosecond multiplier (unadjusted by NTP)
  * @shift:		cycle to nanosecond divisor (power of two)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
@@ -63,6 +64,7 @@ struct clocksource {
 	cycle_t (*read)(void);
 	cycle_t mask;
 	u32 mult;
+	u32 mult_orig;
 	u32 shift;
 	unsigned long flags;
 	cycle_t (*vread)(void);
@@ -201,16 +203,17 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 {
 	u64 tmp;
 
-	/* XXX - All of this could use a whole lot of optimization */
+	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = length_nsec;
 	tmp <<= c->shift;
-	tmp += c->mult/2;
-	do_div(tmp, c->mult);
+	tmp += c->mult_orig/2;
+	do_div(tmp, c->mult_orig);
 
 	c->cycle_interval = (cycle_t)tmp;
 	if (c->cycle_interval == 0)
 		c->cycle_interval = 1;
 
+	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
 }
 
-- 
cgit v1.2.3-71-gd317


From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:30 -0700
Subject: clocksource: introduce CLOCK_MONOTONIC_RAW

In talking with Josip Loncaric, and his work on clock synchronization (see
btime.sf.net), he mentioned that for really close synchronization, it is
useful to have access to "hardware time", that is a notion of time that is
not in any way adjusted by the clock slewing done to keep close time sync.

Part of the issue is if we are using the kernel's ntp adjusted
representation of time in order to measure how we should correct time, we
can run into what Paul McKenney aptly described as "Painting a road using
the lines we're painting as the guide".

I had been thinking of a similar problem, and was trying to come up with a
way to give users access to a purely hardware based time representation
that avoided users having to know the underlying frequency and mask values
needed to deal with the wide variety of possible underlying hardware
counters.

My solution is to introduce CLOCK_MONOTONIC_RAW.  This exposes a
nanosecond based time value, that increments starting at bootup and has no
frequency adjustments made to it what so ever.

The time is accessed from userspace via the posix_clock_gettime() syscall,
passing CLOCK_MONOTONIC_RAW as the clock_id.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h |  3 +++
 include/linux/time.h        |  2 ++
 kernel/posix-timers.c       | 15 +++++++++++++++
 kernel/time/timekeeping.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)

(limited to 'include')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f0a7fb984413..f88d32f8ff7c 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -79,6 +79,7 @@ struct clocksource {
 	/* timekeeping specific data, ignore */
 	cycle_t cycle_interval;
 	u64	xtime_interval;
+	u32	raw_interval;
 	/*
 	 * Second part is written at each timer interrupt
 	 * Keep it in a different cache line to dirty no
@@ -87,6 +88,7 @@ struct clocksource {
 	cycle_t cycle_last ____cacheline_aligned_in_smp;
 	u64 xtime_nsec;
 	s64 error;
+	struct timespec raw_time;
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 	/* Watchdog related data, used by the framework */
@@ -215,6 +217,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 
 	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
+	c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
 }
 
 
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82e..205f974b9ebf 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -117,6 +117,7 @@ extern int do_setitimer(int which, struct itimerval *value,
 extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
+extern void getrawmonotonic(struct timespec *ts);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
 
@@ -214,6 +215,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC			1
 #define CLOCK_PROCESS_CPUTIME_ID	2
 #define CLOCK_THREAD_CPUTIME_ID		3
+#define CLOCK_MONOTONIC_RAW		4
 
 /*
  * The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..d3c66b53dff6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -222,6 +222,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+	getrawmonotonic(tp);
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
 		.clock_get = posix_ktime_get_ts,
 		.clock_set = do_posix_clock_nosettime,
 	};
+	struct k_clock clock_monotonic_raw = {
+		.clock_getres = hrtimer_get_res,
+		.clock_get = posix_get_monotonic_raw,
+		.clock_set = do_posix_clock_nosettime,
+	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 83d3555a6998..5099c95b8aa2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -75,6 +75,9 @@ static void clocksource_forward_now(void)
 
 	nsec = cyc2ns(clock, cycle_delta);
 	timespec_add_ns(&xtime, nsec);
+
+	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+	clock->raw_time.tv_nsec += nsec;
 }
 
 /**
@@ -183,6 +186,8 @@ static void change_clocksource(void)
 
 	clocksource_forward_now();
 
+	new->raw_time = clock->raw_time;
+
 	clock = new;
 	clock->cycle_last = 0;
 	clock->cycle_last = clocksource_read(new);
@@ -204,6 +209,39 @@ static inline void clocksource_forward_now(void) { }
 static inline void change_clocksource(void) { }
 #endif
 
+/**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+	unsigned long seq;
+	s64 nsecs;
+	cycle_t cycle_now, cycle_delta;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		/* read clocksource: */
+		cycle_now = clocksource_read(clock);
+
+		/* calculate the delta since the last update_wall_time: */
+		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+		/* convert to nanoseconds: */
+		nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+
+		*ts = clock->raw_time;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
 /**
  * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
  */
@@ -466,6 +504,12 @@ void update_wall_time(void)
 			second_overflow();
 		}
 
+		clock->raw_time.tv_nsec += clock->raw_interval;
+		if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+			clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+			clock->raw_time.tv_sec++;
+		}
+
 		/* accumulate error between NTP and clock interval */
 		clock->error += tick_length;
 		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
-- 
cgit v1.2.3-71-gd317


From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Wed, 20 Aug 2008 16:46:08 -0700
Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup

Thanks to the review by Michael Kerrisk a bug in the recent
ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was
inadvertently set by it.  This fixes this by making the adjtime code
more separate from the ntp_adjtime code (both of which really want to
be separate syscalls).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/timex.h |  9 +++++-
 kernel/time/ntp.c     | 76 +++++++++++++++++++++++++++------------------------
 2 files changed, 48 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index fc6035d29d56..c00bcdd3ae42 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -141,8 +141,15 @@ struct timex {
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
+
+#ifdef __KERNEL__
+#define ADJ_ADJTIME		0x8000	/* switch between adjtime/adjtimex modes */
+#define ADJ_OFFSET_SINGLESHOT	0x0001	/* old-fashioned adjtime */
+#define ADJ_OFFSET_READONLY	0x2000	/* read-only adjtime */
+#else
 #define ADJ_OFFSET_SINGLESHOT	0x8001	/* old-fashioned adjtime */
-#define ADJ_OFFSET_SS_READ	0xa001  /* read-only adjtime */
+#define ADJ_OFFSET_SS_READ	0xa001	/* read-only adjtime */
+#endif
 
 /* xntp 3.4 compatibility names */
 #define MOD_OFFSET	ADJ_OFFSET
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..c6921aa1a42a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { }
 int do_adjtimex(struct timex *txc)
 {
 	struct timespec ts;
-	long save_adjust, sec;
 	int result;
 
-	/* In order to modify anything, you gotta be super-user! */
-	if (txc->modes && !capable(CAP_SYS_TIME))
-		return -EPERM;
-
-	/* Now we validate the data before disabling interrupts */
-
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
+	/* Validate the data before disabling interrupts */
+	if (txc->modes & ADJ_ADJTIME) {
 		/* singleshot must not be used with any other mode bits */
-		if (txc->modes & ~ADJ_OFFSET_SS_READ)
+		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
 			return -EINVAL;
+		if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+		    !capable(CAP_SYS_TIME))
+			return -EPERM;
+	} else {
+		/* In order to modify anything, you gotta be super-user! */
+		 if (txc->modes && !capable(CAP_SYS_TIME))
+			return -EPERM;
+
+		/* if the quartz is off by more than 10% something is VERY wrong! */
+		if (txc->modes & ADJ_TICK &&
+		    (txc->tick <  900000/USER_HZ ||
+		     txc->tick > 1100000/USER_HZ))
+				return -EINVAL;
+
+		if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+			hrtimer_cancel(&leap_timer);
 	}
 
-	/* if the quartz is off by more than 10% something is VERY wrong ! */
-	if (txc->modes & ADJ_TICK)
-		if (txc->tick <  900000/USER_HZ ||
-		    txc->tick > 1100000/USER_HZ)
-			return -EINVAL;
-
-	if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
-		hrtimer_cancel(&leap_timer);
 	getnstimeofday(&ts);
 
 	write_seqlock_irq(&xtime_lock);
 
-	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_adjust;
-
 	/* If there are input parameters, then process them */
+	if (txc->modes & ADJ_ADJTIME) {
+		long save_adjust = time_adjust;
+
+		if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+			/* adjtime() is independent from ntp_adjtime() */
+			time_adjust = txc->offset;
+			ntp_update_frequency();
+		}
+		txc->offset = save_adjust;
+		goto adj_done;
+	}
 	if (txc->modes) {
+		long sec;
+
 		if (txc->modes & ADJ_STATUS) {
 			if ((time_status & STA_PLL) &&
 			    !(txc->status & STA_PLL)) {
@@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc)
 		if (txc->modes & ADJ_TAI && txc->constant > 0)
 			time_tai = txc->constant;
 
-		if (txc->modes & ADJ_OFFSET) {
-			if (txc->modes == ADJ_OFFSET_SINGLESHOT)
-				/* adjtime() is independent from ntp_adjtime() */
-				time_adjust = txc->offset;
-			else
-				ntp_update_offset(txc->offset);
-		}
+		if (txc->modes & ADJ_OFFSET)
+			ntp_update_offset(txc->offset);
 		if (txc->modes & ADJ_TICK)
 			tick_usec = txc->tick;
 
@@ -389,19 +396,16 @@ int do_adjtimex(struct timex *txc)
 			ntp_update_frequency();
 	}
 
+	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+				  NTP_SCALE_SHIFT);
+	if (!(time_status & STA_NANO))
+		txc->offset /= NSEC_PER_USEC;
+
+adj_done:
 	result = time_state;	/* mostly `TIME_OK' */
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
-	if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
-	    (txc->modes == ADJ_OFFSET_SS_READ))
-		txc->offset = save_adjust;
-	else {
-		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
-					  NTP_SCALE_SHIFT);
-		if (!(time_status & STA_NANO))
-			txc->offset /= NSEC_PER_USEC;
-	}
 	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
 					 (s64)PPM_SCALE_INV,
 					 NTP_SCALE_SHIFT);
-- 
cgit v1.2.3-71-gd317


From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang

Overview

This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling.  It was put together
with the help of Roland McGrath, the owner and original writer of this code.

The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads.  It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.

This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."

Code Changes

This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine.  (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.)  To do this, at each tick we now update fields in
signal_struct as well as task_struct.  The run_posix_cpu_timers() function
uses those fields to make its decisions.

We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels.  For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:

struct thread_group_cputime {
	struct task_cputime totals;
};

struct thread_group_cputime {
	struct task_cputime *totals;
};

We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers).  The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends.  In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention).  For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu().  The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().

We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel.  The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields.  The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures.  The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated.  The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU.  Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.

Non-SMP operation is trivial and will not be mentioned further.

The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().

All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.

Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away.  All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline.  When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.

Performance

The fix appears not to add significant overhead to existing operations.  It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below).  Overall it's a wash except in those
two cases.

I've since done somewhat more involved testing on a dual-core Opteron system.

Case 1: With no itimer running, for a test with 100,000 threads, the fixed
	kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
	all of which was spent in the system.  There were twice as many
	voluntary context switches with the fix as without it.

Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
	an unmodified kernel can handle), the fixed kernel ran the test in
	eight percent of the time (5.8 seconds as opposed to 70 seconds) and
	had better tick accuracy (.012 seconds per tick as opposed to .023
	seconds per tick).

Case 3: A 4000-thread test with an initial timer tick of .01 second and an
	interval of 10,000 seconds (i.e. a timer that ticks only once) had
	very nearly the same performance in both cases:  6.3 seconds elapsed
	for the fixed kernel versus 5.5 seconds for the unfixed kernel.

With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds).  The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.

Since the fix affected the rlimit code, I also tested soft and hard CPU limits.

Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
	running), the modified kernel was very slightly favored in that while
	it killed the process in 19.997 seconds of CPU time (5.002 seconds of
	wall time), only .003 seconds of that was system time, the rest was
	user time.  The unmodified kernel killed the process in 20.001 seconds
	of CPU (5.014 seconds of wall time) of which .016 seconds was system
	time.  Really, though, the results were too close to call.  The results
	were essentially the same with no itimer running.

Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
	(where the hard limit would never be reached) and an itimer running,
	the modified kernel exhibited worse tick accuracy than the unmodified
	kernel: .050 seconds/tick versus .028 seconds/tick.  Otherwise,
	performance was almost indistinguishable.  With no itimer running this
	test exhibited virtually identical behavior and times in both cases.

In times past I did some limited performance testing.  those results are below.

On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s.  On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds.  Performance with eight, four and one
thread were comparable.  Interestingly, the timer ticks with the fix seemed
more accurate:  The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick.  Both cases were configured for an interval of
0.01 seconds.  Again, the other tests were comparable.  Each thread in this
test computed the primes up to 25,000,000.

I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix.  In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable).  System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s).  It received 147651 ticks for 0.015 seconds per tick, still quite
accurate.  There is obviously no comparable test without the fix.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/binfmt_elf.c              |  19 +-
 fs/proc/array.c              |   8 +-
 include/linux/posix-timers.h |   2 +
 include/linux/sched.h        | 257 +++++++++++++++++++++--
 include/linux/time.h         |   3 +
 kernel/compat.c              |  53 ++---
 kernel/exit.c                |  19 +-
 kernel/fork.c                |  88 ++++----
 kernel/itimer.c              |  33 +--
 kernel/posix-cpu-timers.c    | 471 +++++++++++++++++++++++--------------------
 kernel/sched.c               |  53 ++++-
 kernel/sched_fair.c          |   1 +
 kernel/sched_rt.c            |   4 +-
 kernel/signal.c              |   8 +-
 kernel/sys.c                 |  75 +++----
 security/selinux/hooks.c     |   9 +-
 16 files changed, 677 insertions(+), 426 deletions(-)

(limited to 'include')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 655ed8d30a86..a8635f637038 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1333,20 +1333,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 71c9be59c9c2..933953c4e407 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -395,20 +395,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
+			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime = cputime_add(utime, task_utime(t));
-				stime = cputime_add(stime, task_stime(t));
 				gtime = cputime_add(gtime, task_gtime(t));
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime = cputime_add(utime, sig->utime);
-			stime = cputime_add(stime, sig->stime);
+			thread_group_cputime(task, &cputime);
+			utime = cputime.utime;
+			stime = cputime.stime;
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f30ade..f9d8e9e94e9b 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
+void update_rlimit_cpu(unsigned long rlim_new);
+
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad15..26d7a5f2d0ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -425,6 +425,45 @@ struct pacct_struct {
 	unsigned long		ac_minflt, ac_majflt;
 };
 
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime:		time spent in user mode, in &cputime_t units
+ * @stime:		time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ * 
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups.  Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp	stime
+#define virt_exp	utime
+#define sched_exp	sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - thread group interval timer counts
+ * @totals:		thread group interval timers; substructure for
+ *			uniprocessor kernel, per-cpu for SMP kernel.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU clock calculations.
+ */
+#ifdef CONFIG_SMP
+struct thread_group_cputime {
+	struct task_cputime *totals;
+};
+#else
+struct thread_group_cputime {
+	struct task_cputime totals;
+};
+#endif
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -470,6 +509,17 @@ struct signal_struct {
 	cputime_t it_prof_expires, it_virt_expires;
 	cputime_t it_prof_incr, it_virt_incr;
 
+	/*
+	 * Thread group totals for process CPU clocks.
+	 * See thread_group_cputime(), et al, for details.
+	 */
+	struct thread_group_cputime cputime;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
 	/* job control IDs */
 
 	/*
@@ -500,7 +550,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t utime, stime, cutime, cstime;
+	cputime_t cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -508,14 +558,6 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
-	/*
-	 * Cumulative ns of scheduled CPU time for dead threads in the
-	 * group, not including a zombie group leader.  (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
@@ -527,8 +569,6 @@ struct signal_struct {
 	 */
 	struct rlimit rlim[RLIM_NLIMITS];
 
-	struct list_head cpu_timers[3];
-
 	/* keep the process-shared keyrings here so that they do the right
 	 * thing in threads created with CLONE_THREAD */
 #ifdef CONFIG_KEYS
@@ -1134,8 +1174,7 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
-  	cputime_t it_prof_expires, it_virt_expires;
-	unsigned long long it_sched_expires;
+	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
 
 /* process credentials */
@@ -1585,6 +1624,7 @@ extern unsigned long long cpu_clock(int cpu);
 
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
@@ -2081,6 +2121,197 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Thread group CPU time accounting.
+ */
+#ifdef CONFIG_SMP
+
+extern int thread_group_cputime_alloc_smp(struct task_struct *);
+extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						    struct task_struct *new)
+{
+	if (curr->signal->cputime.totals)
+		return 0;
+	return thread_group_cputime_alloc_smp(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+	free_percpu(sig->cputime.totals);
+}
+
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * This is a wrapper for the real routine, thread_group_cputime_smp().  See
+ * that routine for details.
+ */
+static inline void thread_group_cputime(
+	struct task_struct *tsk,
+	struct task_cputime *times)
+{
+	thread_group_cputime_smp(tsk, times);
+}
+
+/**
+ * thread_group_cputime_account_user - Maintain utime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the utime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_system - Maintain stime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the stime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
+ *						thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @ns:		Time value by which to increment the sum_exec_runtime field
+ *		of that structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
+
+#else /* CONFIG_SMP */
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals.utime = cputime_zero;
+	sig->cputime.totals.stime = cputime_zero;
+	sig->cputime.totals.sum_exec_runtime = 0;
+}
+
+static inline int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						     struct task_struct *tsk)
+{
+}
+
+static inline void thread_group_cputime(struct task_struct *tsk,
+					 struct task_cputime *cputime)
+{
+	*cputime = tsk->signal->cputime.totals;
+}
+
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+}
+
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+}
+
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	tgtimes->totals->sum_exec_runtime += ns;
+}
+
+#endif /* CONFIG_SMP */
+
+static inline void account_group_user_time(struct task_struct *tsk,
+					    cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_user(&sig->cputime, cputime);
+}
+
+static inline void account_group_system_time(struct task_struct *tsk,
+					      cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_system(&sig->cputime, cputime);
+}
+
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					       unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+}
+
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82e..1b70b3c293e9 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -125,6 +125,9 @@ extern int timekeeping_valid_for_hres(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
 
+struct tms;
+extern void do_sys_times(struct tms *);
+
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:		pointer to the timespec variable to be converted
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..72650e39b3e6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,6 +23,7 @@
 #include <linux/timex.h>
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
+#include <linux/times.h>
 
 #include <asm/uaccess.h>
 
@@ -150,49 +151,23 @@ asmlinkage long compat_sys_setitimer(int which,
 	return 0;
 }
 
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
 asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 {
-	/*
-	 *	In the SMP world we might just be unlucky and have one of
-	 *	the times increment as we use it. Since the value is an
-	 *	atomically safe type this is just fine. Conceptually its
-	 *	as if the syscall took an instant longer to occur.
-	 */
 	if (tbuf) {
+		struct tms tms;
 		struct compat_tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
-		cputime_t utime, stime, cutime, cstime;
-
-		read_lock(&tasklist_lock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		/*
-		 * While we have tasklist_lock read-locked, no dying thread
-		 * can be updating current->signal->[us]time.  Instead,
-		 * we got their counts included in the live thread loop.
-		 * However, another thread can come in right now and
-		 * do a wait call that updates current->signal->c[us]time.
-		 * To make sure we always see that pair updated atomically,
-		 * we take the siglock around fetching them.
-		 */
-		spin_lock_irq(&tsk->sighand->siglock);
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
-
-		tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
-		tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
-		tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
-		tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
+
+		do_sys_times(&tms);
+		/* Convert our struct tms to the compat version. */
+		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
 		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
 			return -EFAULT;
 	}
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..40036ac04271 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
 
@@ -1294,6 +1291,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (likely(!traced)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
+		struct task_cputime cputime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1309,20 +1307,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * need to protect the access to p->parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_cputime() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
 		spin_lock_irq(&p->parent->sighand->siglock);
 		psig = p->parent->signal;
 		sig = p->signal;
+		thread_group_cputime(p, &cputime);
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(cputime.utime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(cputime.stime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..a8ac2efb8e30 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+	/* Thread group counters. */
+	thread_group_cputime_init(sig);
+
+	/* Expiration times and increments. */
+	sig->it_virt_expires = cputime_zero;
+	sig->it_virt_incr = cputime_zero;
+	sig->it_prof_expires = cputime_zero;
+	sig->it_prof_incr = cputime_zero;
+
+	/* Cached expiration times. */
+	sig->cputime_expires.prof_exp = cputime_zero;
+	sig->cputime_expires.virt_exp = cputime_zero;
+	sig->cputime_expires.sched_exp = 0;
+
+	/* The timer lists. */
+	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct signal_struct *sig;
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&current->signal->count);
-		atomic_inc(&current->signal->live);
-		return 0;
+		ret = thread_group_cputime_clone_thread(current, tsk);
+		if (likely(!ret)) {
+			atomic_inc(&current->signal->count);
+			atomic_inc(&current->signal->live);
+		}
+		return ret;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
@@ -795,15 +824,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
 
-	sig->it_virt_expires = cputime_zero;
-	sig->it_virt_incr = cputime_zero;
-	sig->it_prof_expires = cputime_zero;
-	sig->it_prof_incr = cputime_zero;
-
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = NULL;
 
-	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
@@ -820,14 +844,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
-	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-		/*
-		 * New sole thread in the process gets an expiry time
-		 * of the whole CPU time limit.
-		 */
-		tsk->it_prof_expires =
-			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
-	}
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
@@ -837,6 +855,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 void __cleanup_signal(struct signal_struct *sig)
 {
+	thread_group_cputime_free(sig);
 	exit_thread_group_keys(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
@@ -885,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+/*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+	tsk->cputime_expires.prof_exp = cputime_zero;
+	tsk->cputime_expires.virt_exp = cputime_zero;
+	tsk->cputime_expires.sched_exp = 0;
+	INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -995,12 +1027,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
-	p->it_virt_expires = cputime_zero;
-	p->it_prof_expires = cputime_zero;
-	p->it_sched_expires = 0;
-	INIT_LIST_HEAD(&p->cpu_timers[0]);
-	INIT_LIST_HEAD(&p->cpu_timers[1]);
-	INIT_LIST_HEAD(&p->cpu_timers[2]);
+	posix_cpu_timers_init(p);
 
 	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1201,21 +1228,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD) {
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
-
-		if (!cputime_eq(current->signal->it_virt_expires,
-				cputime_zero) ||
-		    !cputime_eq(current->signal->it_prof_expires,
-				cputime_zero) ||
-		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
-		    !list_empty(&current->signal->cpu_timers[0]) ||
-		    !list_empty(&current->signal->cpu_timers[1]) ||
-		    !list_empty(&current->signal->cpu_timers[2])) {
-			/*
-			 * Have child wake up on its first tick to check
-			 * for process CPU timers.
-			 */
-			p->it_prof_expires = jiffies_to_cputime(1);
-		}
 	}
 
 	if (likely(p->pid)) {
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
 		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_virt_expires;
 		cinterval = tsk->signal->it_virt_incr;
 		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_struct *t = tsk;
-			cputime_t utime = tsk->signal->utime;
-			do {
-				utime = cputime_add(utime, t->utime);
-				t = next_thread(t);
-			} while (t != tsk);
+			struct task_cputime cputime;
+			cputime_t utime;
+
+			thread_group_cputime(tsk, &cputime);
+			utime = cputime.utime;
 			if (cputime_le(cval, utime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
 			} else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
 			}
 		}
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		cputime_to_timeval(cval, &value->it_value);
 		cputime_to_timeval(cinterval, &value->it_interval);
 		break;
 	case ITIMER_PROF:
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_prof_expires;
 		cinterval = tsk->signal->it_prof_incr;
 		if (!cputime_eq(cval, cputime_zero)) {
-			struct task_struct *t = tsk;
-			cputime_t ptime = cputime_add(tsk->signal->utime,
-						      tsk->signal->stime);
-			do {
-				ptime = cputime_add(ptime,
-						    cputime_add(t->utime,
-								t->stime));
-				t = next_thread(t);
-			} while (t != tsk);
+			struct task_cputime times;
+			cputime_t ptime;
+
+			thread_group_cputime(tsk, &times);
+			ptime = cputime_add(times.utime, times.stime);
 			if (cputime_le(cval, ptime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
 			} else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
 			}
 		}
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		cputime_to_timeval(cval, &value->it_value);
 		cputime_to_timeval(cinterval, &value->it_interval);
 		break;
@@ -185,7 +176,6 @@ again:
 	case ITIMER_VIRTUAL:
 		nval = timeval_to_cputime(&value->it_value);
 		ninterval = timeval_to_cputime(&value->it_interval);
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_virt_expires;
 		cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
 		tsk->signal->it_virt_expires = nval;
 		tsk->signal->it_virt_incr = ninterval;
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		if (ovalue) {
 			cputime_to_timeval(cval, &ovalue->it_value);
 			cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
 	case ITIMER_PROF:
 		nval = timeval_to_cputime(&value->it_value);
 		ninterval = timeval_to_cputime(&value->it_interval);
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&tsk->sighand->siglock);
 		cval = tsk->signal->it_prof_expires;
 		cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
 		tsk->signal->it_prof_expires = nval;
 		tsk->signal->it_prof_incr = ninterval;
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		if (ovalue) {
 			cputime_to_timeval(cval, &ovalue->it_value);
 			cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..dba1c334c3e8 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,6 +8,99 @@
 #include <linux/math64.h>
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_SMP
+/*
+ * Allocate the thread_group_cputime structure appropriately for SMP kernels
+ * and fill in the current values of the fields.  Called from copy_signal()
+ * via thread_group_cputime_clone_thread() when adding a second or subsequent
+ * thread to a thread group.  Assumes interrupts are enabled when called.
+ */
+int thread_group_cputime_alloc_smp(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct task_cputime *cputime;
+
+	/*
+	 * If we have multiple threads and we don't already have a
+	 * per-CPU task_cputime struct, allocate one and fill it in with
+	 * the times accumulated so far.
+	 */
+	if (sig->cputime.totals)
+		return 0;
+	cputime = alloc_percpu(struct task_cputime);
+	if (cputime == NULL)
+		return -ENOMEM;
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (sig->cputime.totals) {
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		free_percpu(cputime);
+		return 0;
+	}
+	sig->cputime.totals = cputime;
+	cputime = per_cpu_ptr(sig->cputime.totals, get_cpu());
+	cputime->utime = tsk->utime;
+	cputime->stime = tsk->stime;
+	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
+	put_cpu_no_resched();
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+/**
+ * thread_group_cputime_smp - Sum the thread group time fields across all CPUs.
+ *
+ * @tsk:	The task we use to identify the thread group.
+ * @times:	task_cputime structure in which we return the summed fields.
+ *
+ * Walk the list of CPUs to sum the per-CPU time fields in the thread group
+ * time structure.
+ */
+void thread_group_cputime_smp(
+	struct task_struct *tsk,
+	struct task_cputime *times)
+{
+	struct signal_struct *sig;
+	int i;
+	struct task_cputime *tot;
+
+	sig = tsk->signal;
+	if (unlikely(!sig) || !sig->cputime.totals) {
+		times->utime = tsk->utime;
+		times->stime = tsk->stime;
+		times->sum_exec_runtime = tsk->se.sum_exec_runtime;
+		return;
+	}
+	times->stime = times->utime = cputime_zero;
+	times->sum_exec_runtime = 0;
+	for_each_possible_cpu(i) {
+		tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
+		times->utime = cputime_add(times->utime, tot->utime);
+		times->stime = cputime_add(times->stime, tot->stime);
+		times->sum_exec_runtime += tot->sum_exec_runtime;
+	}
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Called after updating RLIMIT_CPU to set timer expiration if necessary.
+ */
+void update_rlimit_cpu(unsigned long rlim_new)
+{
+	cputime_t cputime;
+
+	cputime = secs_to_cputime(rlim_new);
+	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+            cputime_lt(current->signal->it_prof_expires, cputime)) {
+		spin_lock_irq(&current->sighand->siglock);
+		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
 static int check_clock(const clockid_t which_clock)
 {
 	int error = 0;
@@ -158,10 +251,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 {
 	return p->utime;
 }
-static inline unsigned long long sched_ns(struct task_struct *p)
-{
-	return task_sched_runtime(p);
-}
 
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
@@ -211,7 +300,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = sched_ns(p);
+		cpu->sched = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -226,31 +315,20 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 					 struct task_struct *p,
 					 union cpu_time_count *cpu)
 {
-	struct task_struct *t = p;
- 	switch (clock_idx) {
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+	switch (clock_idx) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
-		do {
-			cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
-			t = next_thread(t);
-		} while (t != p);
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = p->signal->utime;
-		do {
-			cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
-			t = next_thread(t);
-		} while (t != p);
+		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->signal->sum_sched_runtime;
-		/* Add in each other live thread.  */
-		while ((t = next_thread(t)) != p) {
-			cpu->sched += t->se.sum_exec_runtime;
-		}
-		cpu->sched += sched_ns(p);
+		cpu->sched = thread_group_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -471,80 +549,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-	cleanup_timers(tsk->signal->cpu_timers,
-		       cputime_add(tsk->utime, tsk->signal->utime),
-		       cputime_add(tsk->stime, tsk->signal->stime),
-		     tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
-}
-
-
-/*
- * Set the expiry times of all the threads in the process so one of them
- * will go off before the process cumulative expiry total is reached.
- */
-static void process_timer_rebalance(struct task_struct *p,
-				    unsigned int clock_idx,
-				    union cpu_time_count expires,
-				    union cpu_time_count val)
-{
-	cputime_t ticks, left;
-	unsigned long long ns, nsleft;
- 	struct task_struct *t = p;
-	unsigned int nthreads = atomic_read(&p->signal->live);
-
-	if (!nthreads)
-		return;
+	struct task_cputime cputime;
 
-	switch (clock_idx) {
-	default:
-		BUG();
-		break;
-	case CPUCLOCK_PROF:
-		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-				       nthreads);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ticks = cputime_add(prof_ticks(t), left);
-				if (cputime_eq(t->it_prof_expires,
-					       cputime_zero) ||
-				    cputime_gt(t->it_prof_expires, ticks)) {
-					t->it_prof_expires = ticks;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	case CPUCLOCK_VIRT:
-		left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-				       nthreads);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ticks = cputime_add(virt_ticks(t), left);
-				if (cputime_eq(t->it_virt_expires,
-					       cputime_zero) ||
-				    cputime_gt(t->it_virt_expires, ticks)) {
-					t->it_virt_expires = ticks;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	case CPUCLOCK_SCHED:
-		nsleft = expires.sched - val.sched;
-		do_div(nsleft, nthreads);
-		nsleft = max_t(unsigned long long, nsleft, 1);
-		do {
-			if (likely(!(t->flags & PF_EXITING))) {
-				ns = t->se.sum_exec_runtime + nsleft;
-				if (t->it_sched_expires == 0 ||
-				    t->it_sched_expires > ns) {
-					t->it_sched_expires = ns;
-				}
-			}
-			t = next_thread(t);
-		} while (t != p);
-		break;
-	}
+	thread_group_cputime(tsk, &cputime);
+	cleanup_timers(tsk->signal->cpu_timers,
+		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +617,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 			default:
 				BUG();
 			case CPUCLOCK_PROF:
-				if (cputime_eq(p->it_prof_expires,
+				if (cputime_eq(p->cputime_expires.prof_exp,
 					       cputime_zero) ||
-				    cputime_gt(p->it_prof_expires,
+				    cputime_gt(p->cputime_expires.prof_exp,
 					       nt->expires.cpu))
-					p->it_prof_expires = nt->expires.cpu;
+					p->cputime_expires.prof_exp =
+						nt->expires.cpu;
 				break;
 			case CPUCLOCK_VIRT:
-				if (cputime_eq(p->it_virt_expires,
+				if (cputime_eq(p->cputime_expires.virt_exp,
 					       cputime_zero) ||
-				    cputime_gt(p->it_virt_expires,
+				    cputime_gt(p->cputime_expires.virt_exp,
 					       nt->expires.cpu))
-					p->it_virt_expires = nt->expires.cpu;
+					p->cputime_expires.virt_exp =
+						nt->expires.cpu;
 				break;
 			case CPUCLOCK_SCHED:
-				if (p->it_sched_expires == 0 ||
-				    p->it_sched_expires > nt->expires.sched)
-					p->it_sched_expires = nt->expires.sched;
+				if (p->cputime_expires.sched_exp == 0 ||
+				    p->cputime_expires.sched_exp >
+							nt->expires.sched)
+					p->cputime_expires.sched_exp =
+						nt->expires.sched;
 				break;
 			}
 		} else {
 			/*
-			 * For a process timer, we must balance
-			 * all the live threads' expirations.
+			 * For a process timer, set the cached expiration time.
 			 */
 			switch (CPUCLOCK_WHICH(timer->it_clock)) {
 			default:
@@ -641,7 +653,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 				    cputime_lt(p->signal->it_virt_expires,
 					       timer->it.cpu.expires.cpu))
 					break;
-				goto rebalance;
+				p->signal->cputime_expires.virt_exp =
+					timer->it.cpu.expires.cpu;
+				break;
 			case CPUCLOCK_PROF:
 				if (!cputime_eq(p->signal->it_prof_expires,
 						cputime_zero) &&
@@ -652,13 +666,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 				if (i != RLIM_INFINITY &&
 				    i <= cputime_to_secs(timer->it.cpu.expires.cpu))
 					break;
-				goto rebalance;
+				p->signal->cputime_expires.prof_exp =
+					timer->it.cpu.expires.cpu;
+				break;
 			case CPUCLOCK_SCHED:
-			rebalance:
-				process_timer_rebalance(
-					timer->it.cpu.task,
-					CPUCLOCK_WHICH(timer->it_clock),
-					timer->it.cpu.expires, now);
+				p->signal->cputime_expires.sched_exp =
+					timer->it.cpu.expires.sched;
 				break;
 			}
 		}
@@ -969,13 +982,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	struct signal_struct *const sig = tsk->signal;
 
 	maxfire = 20;
-	tsk->it_prof_expires = cputime_zero;
+	tsk->cputime_expires.prof_exp = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
-			tsk->it_prof_expires = t->expires.cpu;
+			tsk->cputime_expires.prof_exp = t->expires.cpu;
 			break;
 		}
 		t->firing = 1;
@@ -984,13 +997,13 @@ static void check_thread_timers(struct task_struct *tsk,
 
 	++timers;
 	maxfire = 20;
-	tsk->it_virt_expires = cputime_zero;
+	tsk->cputime_expires.virt_exp = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
-			tsk->it_virt_expires = t->expires.cpu;
+			tsk->cputime_expires.virt_exp = t->expires.cpu;
 			break;
 		}
 		t->firing = 1;
@@ -999,13 +1012,13 @@ static void check_thread_timers(struct task_struct *tsk,
 
 	++timers;
 	maxfire = 20;
-	tsk->it_sched_expires = 0;
+	tsk->cputime_expires.sched_exp = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
 		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-			tsk->it_sched_expires = t->expires.sched;
+			tsk->cputime_expires.sched_exp = t->expires.sched;
 			break;
 		}
 		t->firing = 1;
@@ -1055,10 +1068,10 @@ static void check_process_timers(struct task_struct *tsk,
 {
 	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
-	cputime_t utime, stime, ptime, virt_expires, prof_expires;
+	cputime_t utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
-	struct task_struct *t;
 	struct list_head *timers = sig->cpu_timers;
+	struct task_cputime cputime;
 
 	/*
 	 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1087,10 @@ static void check_process_timers(struct task_struct *tsk,
 	/*
 	 * Collect the current process totals.
 	 */
-	utime = sig->utime;
-	stime = sig->stime;
-	sum_sched_runtime = sig->sum_sched_runtime;
-	t = tsk;
-	do {
-		utime = cputime_add(utime, t->utime);
-		stime = cputime_add(stime, t->stime);
-		sum_sched_runtime += t->se.sum_exec_runtime;
-		t = next_thread(t);
-	} while (t != tsk);
-	ptime = cputime_add(utime, stime);
-
+	thread_group_cputime(tsk, &cputime);
+	utime = cputime.utime;
+	ptime = cputime_add(utime, cputime.stime);
+	sum_sched_runtime = cputime.sum_exec_runtime;
 	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
@@ -1193,60 +1198,18 @@ static void check_process_timers(struct task_struct *tsk,
 		}
 	}
 
-	if (!cputime_eq(prof_expires, cputime_zero) ||
-	    !cputime_eq(virt_expires, cputime_zero) ||
-	    sched_expires != 0) {
-		/*
-		 * Rebalance the threads' expiry times for the remaining
-		 * process CPU timers.
-		 */
-
-		cputime_t prof_left, virt_left, ticks;
-		unsigned long long sched_left, sched;
-		const unsigned int nthreads = atomic_read(&sig->live);
-
-		if (!nthreads)
-			return;
-
-		prof_left = cputime_sub(prof_expires, utime);
-		prof_left = cputime_sub(prof_left, stime);
-		prof_left = cputime_div_non_zero(prof_left, nthreads);
-		virt_left = cputime_sub(virt_expires, utime);
-		virt_left = cputime_div_non_zero(virt_left, nthreads);
-		if (sched_expires) {
-			sched_left = sched_expires - sum_sched_runtime;
-			do_div(sched_left, nthreads);
-			sched_left = max_t(unsigned long long, sched_left, 1);
-		} else {
-			sched_left = 0;
-		}
-		t = tsk;
-		do {
-			if (unlikely(t->flags & PF_EXITING))
-				continue;
-
-			ticks = cputime_add(cputime_add(t->utime, t->stime),
-					    prof_left);
-			if (!cputime_eq(prof_expires, cputime_zero) &&
-			    (cputime_eq(t->it_prof_expires, cputime_zero) ||
-			     cputime_gt(t->it_prof_expires, ticks))) {
-				t->it_prof_expires = ticks;
-			}
-
-			ticks = cputime_add(t->utime, virt_left);
-			if (!cputime_eq(virt_expires, cputime_zero) &&
-			    (cputime_eq(t->it_virt_expires, cputime_zero) ||
-			     cputime_gt(t->it_virt_expires, ticks))) {
-				t->it_virt_expires = ticks;
-			}
-
-			sched = t->se.sum_exec_runtime + sched_left;
-			if (sched_expires && (t->it_sched_expires == 0 ||
-					      t->it_sched_expires > sched)) {
-				t->it_sched_expires = sched;
-			}
-		} while ((t = next_thread(t)) != tsk);
-	}
+	if (!cputime_eq(prof_expires, cputime_zero) &&
+	    (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
+	     cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
+		sig->cputime_expires.prof_exp = prof_expires;
+	if (!cputime_eq(virt_expires, cputime_zero) &&
+	    (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
+	     cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
+		sig->cputime_expires.virt_exp = virt_expires;
+	if (sched_expires != 0 &&
+	    (sig->cputime_expires.sched_exp == 0 ||
+	     sig->cputime_expires.sched_exp > sched_expires))
+		sig->cputime_expires.sched_exp = sched_expires;
 }
 
 /*
@@ -1314,6 +1277,78 @@ out:
 	++timer->it_requeue_pending;
 }
 
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime:	The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero.  Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+	if (cputime_eq(cputime->utime, cputime_zero) &&
+	    cputime_eq(cputime->stime, cputime_zero) &&
+	    cputime->sum_exec_runtime == 0)
+		return 1;
+	return 0;
+}
+
+/**
+ * task_cputime_expired - Compare two task_cputime entities.
+ *
+ * @sample:	The task_cputime structure to be checked for expiration.
+ * @expires:	Expiration times, against which @sample will be checked.
+ *
+ * Checks @sample against @expires to see if any field of @sample has expired.
+ * Returns true if any field of the former is greater than the corresponding
+ * field of the latter if the latter field is set.  Otherwise returns false.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+					const struct task_cputime *expires)
+{
+	if (!cputime_eq(expires->utime, cputime_zero) &&
+	    cputime_ge(sample->utime, expires->utime))
+		return 1;
+	if (!cputime_eq(expires->stime, cputime_zero) &&
+	    cputime_ge(cputime_add(sample->utime, sample->stime),
+		       expires->stime))
+		return 1;
+	if (expires->sum_exec_runtime != 0 &&
+	    sample->sum_exec_runtime >= expires->sum_exec_runtime)
+		return 1;
+	return 0;
+}
+
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk:	The task (thread) being checked.
+ * @sig:	The signal pointer for that task.
+ *
+ * If there are no timers set return false.  Otherwise snapshot the task and
+ * thread group timers, then compare them with the corresponding expiration
+ # times.  Returns true if a timer has expired, else returns false.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk,
+					struct signal_struct *sig)
+{
+	struct task_cputime task_sample = {
+		.utime = tsk->utime,
+		.stime = tsk->stime,
+		.sum_exec_runtime = tsk->se.sum_exec_runtime
+	};
+	struct task_cputime group_sample;
+
+	if (task_cputime_zero(&tsk->cputime_expires) &&
+	    task_cputime_zero(&sig->cputime_expires))
+		return 0;
+	if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+		return 1;
+	thread_group_cputime(tsk, &group_sample);
+	return task_cputime_expired(&group_sample, &sig->cputime_expires);
+}
+
 /*
  * This is called from the timer interrupt handler.  The irq handler has
  * already updated our counts.  We need to check if any timers fire now.
@@ -1323,30 +1358,29 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
+	struct signal_struct *sig;
+	struct sighand_struct *sighand;
+	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
-#define UNEXPIRED(clock) \
-		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
-		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
-
-	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
-	    (tsk->it_sched_expires == 0 ||
-	     tsk->se.sum_exec_runtime < tsk->it_sched_expires))
-		return;
-
-#undef	UNEXPIRED
-
+	/* Pick up tsk->signal and make sure it's valid. */
+	sig = tsk->signal;
 	/*
-	 * Double-check with locks held.
+	 * The fast path checks that there are no expired thread or thread
+	 * group timers.  If that's so, just return.  Also check that
+	 * tsk->signal is non-NULL; this probably can't happen but cover the
+	 * possibility anyway.
 	 */
-	read_lock(&tasklist_lock);
-	if (likely(tsk->signal != NULL)) {
-		spin_lock(&tsk->sighand->siglock);
-
+	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) {
+		return;
+	}
+	sighand = lock_task_sighand(tsk, &flags);
+	if (likely(sighand)) {
 		/*
-		 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-		 * all the timers that are firing, and put them on the firing list.
+		 * Here we take off tsk->signal->cpu_timers[N] and
+		 * tsk->cpu_timers[N] all the timers that are firing, and
+		 * put them on the firing list.
 		 */
 		check_thread_timers(tsk, &firing);
 		check_process_timers(tsk, &firing);
@@ -1359,9 +1393,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 		 * that gets the timer lock before we do will give it up and
 		 * spin until we've taken care of that timer below.
 		 */
-		spin_unlock(&tsk->sighand->siglock);
 	}
-	read_unlock(&tasklist_lock);
+	unlock_task_sighand(tsk, &flags);
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1422,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 /*
  * Set one of the process-wide special case CPU timers.
- * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
- * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
- * absolute; non-null for ITIMER_*, where *newval is relative and we update
- * it to be absolute, *oldval is absolute and we update it to be relative.
+ * The tsk->sighand->siglock must be held by the caller.
+ * The *newval argument is relative and we update it to be absolute, *oldval
+ * is absolute and we update it to be relative.
  */
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval)
@@ -1435,13 +1467,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	    cputime_ge(list_first_entry(head,
 				  struct cpu_timer_list, entry)->expires.cpu,
 		       *newval)) {
-		/*
-		 * Rejigger each thread's expiry time so that one will
-		 * notice before we hit the process-cumulative expiry time.
-		 */
-		union cpu_time_count expires = { .sched = 0 };
-		expires.cpu = *newval;
-		process_timer_rebalance(tsk, clock_idx, expires, now);
+		switch (clock_idx) {
+		case CPUCLOCK_PROF:
+			tsk->signal->cputime_expires.prof_exp = *newval;
+			break;
+		case CPUCLOCK_VIRT:
+			tsk->signal->cputime_expires.virt_exp = *newval;
+			break;
+		}
 	}
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..c51b5d276665 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4036,6 +4036,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
+ */
+static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	if (task_current(rq, p)) {
+		u64 delta_exec;
+
+		update_rq_clock(rq);
+		delta_exec = rq->clock - p->se.exec_start;
+		if ((s64)delta_exec > 0)
+			return delta_exec;
+	}
+	return 0;
+}
+
 /*
  * Return p->sum_exec_runtime plus any more ns on the sched_clock
  * that have not yet been banked in case the task is currently running.
@@ -4043,17 +4062,31 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long flags;
-	u64 ns, delta_exec;
+	u64 ns;
 	struct rq *rq;
 
 	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
-	if (task_current(rq, p)) {
-		update_rq_clock(rq);
-		delta_exec = rq->clock - p->se.exec_start;
-		if ((s64)delta_exec > 0)
-			ns += delta_exec;
-	}
+	ns = p->se.sum_exec_runtime + task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
+
+	return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group plus any more ns on the
+ * sched_clock that have not yet been banked in case the task is currently
+ * running.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	u64 ns;
+	struct rq *rq;
+	struct task_cputime totals;
+
+	rq = task_rq_lock(p, &flags);
+	thread_group_cputime(p, &totals);
+	ns = totals.sum_exec_runtime + task_delta_exec(p, rq);
 	task_rq_unlock(rq, &flags);
 
 	return ns;
@@ -4070,6 +4103,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 	cputime64_t tmp;
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4128,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
 	tmp = cputime_to_cputime64(cputime);
 
 	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4164,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	}
 
 	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4206,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
 	if (p == rq->idle) {
 		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
 		if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 		else
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..99aa31acc544 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -507,6 +507,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
 	}
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..8375e69af36a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -483,6 +483,8 @@ static void update_curr_rt(struct rq *rq)
 	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
 
 	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
@@ -1412,7 +1414,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 		p->rt.timeout++;
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
 		if (p->rt.timeout > next)
-			p->it_sched_expires = p->se.sum_exec_runtime;
+			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
 	}
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..6eea5826d618 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
+	struct task_cputime cputime;
 	int ret = sig;
 
 	BUG_ON(sig == -1);
@@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	info.si_uid = tsk->uid;
 
-	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-						       tsk->signal->utime));
-	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
-						       tsk->signal->stime));
+	thread_group_cputime(tsk, &cputime);
+	info.si_utime = cputime_to_jiffies(cputime.utime);
+	info.si_stime = cputime_to_jiffies(cputime.stime);
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..d046a7a055c2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	return old_fsgid;
 }
 
+void do_sys_times(struct tms *tms)
+{
+	struct task_cputime cputime;
+	cputime_t cutime, cstime;
+
+	spin_lock_irq(&current->sighand->siglock);
+	thread_group_cputime(current, &cputime);
+	cutime = current->signal->cutime;
+	cstime = current->signal->cstime;
+	spin_unlock_irq(&current->sighand->siglock);
+	tms->tms_utime = cputime_to_clock_t(cputime.utime);
+	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_cutime = cputime_to_clock_t(cutime);
+	tms->tms_cstime = cputime_to_clock_t(cstime);
+}
+
 asmlinkage long sys_times(struct tms __user * tbuf)
 {
-	/*
-	 *	In the SMP world we might just be unlucky and have one of
-	 *	the times increment as we use it. Since the value is an
-	 *	atomically safe type this is just fine. Conceptually its
-	 *	as if the syscall took an instant longer to occur.
-	 */
 	if (tbuf) {
 		struct tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
-		cputime_t utime, stime, cutime, cstime;
-
-		spin_lock_irq(&tsk->sighand->siglock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-
-		tmp.tms_utime = cputime_to_clock_t(utime);
-		tmp.tms_stime = cputime_to_clock_t(stime);
-		tmp.tms_cutime = cputime_to_clock_t(cutime);
-		tmp.tms_cstime = cputime_to_clock_t(cstime);
+
+		do_sys_times(&tmp);
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
@@ -1445,7 +1435,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 {
 	struct rlimit new_rlim, *old_rlim;
-	unsigned long it_prof_secs;
 	int retval;
 
 	if (resource >= RLIM_NLIMITS)
@@ -1491,18 +1480,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	if (new_rlim.rlim_cur == RLIM_INFINITY)
 		goto out;
 
-	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
-	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
-		unsigned long rlim_cur = new_rlim.rlim_cur;
-		cputime_t cputime;
-
-		cputime = secs_to_cputime(rlim_cur);
-		read_lock(&tasklist_lock);
-		spin_lock_irq(&current->sighand->siglock);
-		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
-		spin_unlock_irq(&current->sighand->siglock);
-		read_unlock(&tasklist_lock);
-	}
+	update_rlimit_cpu(new_rlim.rlim_cur);
 out:
 	return 0;
 }
@@ -1540,11 +1518,8 @@ out:
  *
  */
 
-static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
-				     cputime_t *utimep, cputime_t *stimep)
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
 {
-	*utimep = cputime_add(*utimep, t->utime);
-	*stimep = cputime_add(*stimep, t->stime);
 	r->ru_nvcsw += t->nvcsw;
 	r->ru_nivcsw += t->nivcsw;
 	r->ru_minflt += t->min_flt;
@@ -1558,12 +1533,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	cputime_t utime, stime;
+	struct task_cputime cputime;
 
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		accumulate_thread_rusage(p, r, &utime, &stime);
+		accumulate_thread_rusage(p, r);
 		goto out;
 	}
 
@@ -1586,8 +1562,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			utime = cputime_add(utime, p->signal->utime);
-			stime = cputime_add(stime, p->signal->stime);
+			thread_group_cputime(p, &cputime);
+			utime = cputime_add(utime, cputime.utime);
+			stime = cputime_add(stime, cputime.stime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
@@ -1596,7 +1573,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_oublock += p->signal->oublock;
 			t = p;
 			do {
-				accumulate_thread_rusage(t, r, &utime, &stime);
+				accumulate_thread_rusage(t, r);
 				t = next_thread(t);
 			} while (t != p);
 			break;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 03fc6a81ae32..69649783c266 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -75,6 +75,7 @@
 #include <linux/string.h>
 #include <linux/selinux.h>
 #include <linux/mutex.h>
+#include <linux/posix-timers.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -2321,13 +2322,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 			initrlim = init_task.signal->rlim+i;
 			rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
 		}
-		if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-			/*
-			 * This will cause RLIMIT_CPU calculations
-			 * to be refigured.
-			 */
-			current->it_prof_expires = jiffies_to_cputime(1);
-		}
+		update_rlimit_cpu(rlim->rlim_cur);
 	}
 
 	/* Wake up the parent if it is waiting so that it can
-- 
cgit v1.2.3-71-gd317


From 0a8eaa4f9b58759595a1bfe13a1295fdc25ba026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:03:52 +0200
Subject: timers: fix itimer/many thread hang, fix #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix the UP build:

In file included from arch/x86/kernel/asm-offsets_32.c:9,
                 from arch/x86/kernel/asm-offsets.c:3:
include/linux/sched.h: In function ‘thread_group_cputime_clone_thread’:
include/linux/sched.h:2272: warning: no return statement in function returning non-void
include/linux/sched.h: In function ‘thread_group_cputime_account_user’:
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_system’:
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_exec_runtime’:
include/linux/sched.h:2298: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
distcc[14501] ERROR: compile arch/x86/kernel/asm-offsets.c on a/30 failed
make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26d7a5f2d0ba..ed355f02d329 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2251,6 +2251,7 @@ static inline void thread_group_cputime_free(struct signal_struct *sig)
 static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
 						     struct task_struct *tsk)
 {
+	return 0;
 }
 
 static inline void thread_group_cputime(struct task_struct *tsk,
@@ -2263,21 +2264,21 @@ static inline void thread_group_cputime_account_user(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
 }
 
 static inline void thread_group_cputime_account_system(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
 }
 
 static inline void thread_group_cputime_account_exec_runtime(
 	struct thread_group_cputime *tgtimes,
 	unsigned long long ns)
 {
-	tgtimes->totals->sum_exec_runtime += ns;
+	tgtimes->totals.sum_exec_runtime += ns;
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3-71-gd317


From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:11:46 +0200
Subject: timers: fix itimer/many thread hang, cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     | 2 +-
 kernel/posix-cpu-timers.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ed355f02d329..7ce8d4e53565 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -430,7 +430,7 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * 
+ *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
  * CPU time want to group these counts together and treat all three
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index dba1c334c3e8..9a7ea049fcdc 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -94,7 +94,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
 
 	cputime = secs_to_cputime(rlim_new);
 	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-            cputime_lt(current->signal->it_prof_expires, cputime)) {
+	    cputime_lt(current->signal->it_prof_expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
@@ -1372,9 +1372,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * tsk->signal is non-NULL; this probably can't happen but cover the
 	 * possibility anyway.
 	 */
-	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig)) {
+	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig))
 		return;
-	}
+
 	sighand = lock_task_sighand(tsk, &flags);
 	if (likely(sighand)) {
 		/*
-- 
cgit v1.2.3-71-gd317


From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:44 +0100
Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time()

Peter Zijlstra noticed this 8 months ago and I just noticed
it again.

hrtimer_clock_base::get_softirq_time() is currently unused
in the entire tree. In fact, looking at the logs, it appears
as if it was never used. Remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 2 --
 kernel/hrtimer.c        | 4 +---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cbb..1b079bd29c35 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -145,7 +145,6 @@ struct hrtimer_sleeper {
  * @first:		pointer to the timer node which expires first
  * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
- * @get_softirq_time:	function to retrieve the current time from the softirq
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  * @reprogram:		function to reprogram the timer event
@@ -157,7 +156,6 @@ struct hrtimer_clock_base {
 	struct rb_node		*first;
 	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
-	ktime_t			(*get_softirq_time)(void);
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 03ea1378c43b..4d761d50c529 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1401,9 +1401,7 @@ void hrtimer_run_queues(void)
 		if (!base->first)
 			continue;
 
-		if (base->get_softirq_time)
-			base->softirq_time = base->get_softirq_time();
-		else if (gettime) {
+		if (gettime) {
 			hrtimer_get_softirq_time(cpu_base);
 			gettime = 0;
 		}
-- 
cgit v1.2.3-71-gd317


From b91c4996df56fcd201f85c392a1de7bc3f6641f5 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:48 +0100
Subject: hrtimer: remove hrtimer_clock_base::reprogram()

hrtimer_clock_base::reprogram() also appears to never
have been used, so remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1b079bd29c35..68b0196d8696 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -147,7 +147,6 @@ struct hrtimer_sleeper {
  * @get_time:		function to retrieve the current time of the clock
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
- * @reprogram:		function to reprogram the timer event
  */
 struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
@@ -159,9 +158,6 @@ struct hrtimer_clock_base {
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
-	int			(*reprogram)(struct hrtimer *t,
-					     struct hrtimer_clock_base *b,
-					     ktime_t n);
 #endif
 };
 
-- 
cgit v1.2.3-71-gd317


From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v2

This is the second resubmission of the posix timer rework patch, posted
a few days ago.

This includes the changes from the previous resubmittion, which addressed
Oleg Nesterov's comments, removing the RCU stuff from the patch and
un-inlining the thread_group_cputime() function for SMP.

In addition, per Ingo Molnar it simplifies the UP code, consolidating much
of it with the SMP version and depending on lower-level SMP/UP handling to
take care of the differences.

It also cleans up some UP compile errors, moves the scheduler stats-related
macros into kernel/sched_stats.h, cleans up a merge error in
kernel/fork.c and has a few other minor fixes and cleanups as suggested
by Oleg and Ingo. Thanks for the review, guys.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |   1 +
 include/linux/sched.h       | 183 ++------------------------------------------
 kernel/fork.c               |   5 +-
 kernel/posix-cpu-timers.c   | 153 ++++++++++++++++--------------------
 kernel/sched.c              |  47 ++----------
 kernel/sched_stats.h        | 136 ++++++++++++++++++++++++++++++++
 6 files changed, 214 insertions(+), 311 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a91c9c..cac3750cd65e 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -52,6 +52,7 @@ static inline int kstat_irqs(int irq)
 	return sum;
 }
 
+extern unsigned long long task_delta_exec(struct task_struct *);
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ce8d4e53565..b982fb48c8f0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -454,15 +454,9 @@ struct task_cputime {
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU clock calculations.
  */
-#ifdef CONFIG_SMP
 struct thread_group_cputime {
 	struct task_cputime *totals;
 };
-#else
-struct thread_group_cputime {
-	struct task_cputime totals;
-};
-#endif
 
 /*
  * NOTE! "signal_struct" does not have it's own
@@ -2124,193 +2118,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
-#ifdef CONFIG_SMP
 
-extern int thread_group_cputime_alloc_smp(struct task_struct *);
-extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
 	sig->cputime.totals = NULL;
 }
 
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						    struct task_struct *new)
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 {
 	if (curr->signal->cputime.totals)
 		return 0;
-	return thread_group_cputime_alloc_smp(curr);
+	return thread_group_cputime_alloc(curr);
 }
 
-static inline void thread_group_cputime_free(struct signal_struct *sig)
-{
-	free_percpu(sig->cputime.totals);
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * This is a wrapper for the real routine, thread_group_cputime_smp().  See
- * that routine for details.
- */
-static inline void thread_group_cputime(
-	struct task_struct *tsk,
-	struct task_cputime *times)
-{
-	thread_group_cputime_smp(tsk, times);
-}
-
-/**
- * thread_group_cputime_account_user - Maintain utime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the utime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->utime = cputime_add(times->utime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_system - Maintain stime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the stime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->stime = cputime_add(times->stime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
- *						thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of that structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->sum_exec_runtime += ns;
-		put_cpu_no_resched();
-	}
-}
-
-#else /* CONFIG_SMP */
-
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	sig->cputime.totals.utime = cputime_zero;
-	sig->cputime.totals.stime = cputime_zero;
-	sig->cputime.totals.sum_exec_runtime = 0;
-}
-
-static inline int thread_group_cputime_alloc(struct task_struct *tsk)
-{
-	return 0;
-}
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
-}
-
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						     struct task_struct *tsk)
-{
-	return 0;
-}
-
-static inline void thread_group_cputime(struct task_struct *tsk,
-					 struct task_cputime *cputime)
-{
-	*cputime = tsk->signal->cputime.totals;
-}
-
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
-}
-
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
-}
-
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	tgtimes->totals.sum_exec_runtime += ns;
-}
-
-#endif /* CONFIG_SMP */
-
-static inline void account_group_user_time(struct task_struct *tsk,
-					    cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_user(&sig->cputime, cputime);
-}
-
-static inline void account_group_system_time(struct task_struct *tsk,
-					      cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_system(&sig->cputime, cputime);
-}
-
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					       unsigned long long ns)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+	free_percpu(sig->cputime.totals);
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 1181b9aac48e..021ae012cc75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,7 +791,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		ret = thread_group_cputime_clone_thread(current, tsk);
+		ret = thread_group_cputime_clone_thread(current);
 		if (likely(!ret)) {
 			atomic_inc(&current->signal->count);
 			atomic_inc(&current->signal->live);
@@ -834,9 +834,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
 	task_io_accounting_init(&sig->ioac);
-	INIT_LIST_HEAD(&sig->cpu_timers[0]);
-	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-	INIT_LIST_HEAD(&sig->cpu_timers[2]);
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9a7ea049fcdc..153dcb2639c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,50 +7,46 @@
 #include <linux/errno.h>
 #include <linux/math64.h>
 #include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
 
-#ifdef CONFIG_SMP
 /*
- * Allocate the thread_group_cputime structure appropriately for SMP kernels
- * and fill in the current values of the fields.  Called from copy_signal()
- * via thread_group_cputime_clone_thread() when adding a second or subsequent
+ * Allocate the thread_group_cputime structure appropriately and fill in the
+ * current values of the fields.  Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when adding a second or subsequent
  * thread to a thread group.  Assumes interrupts are enabled when called.
  */
-int thread_group_cputime_alloc_smp(struct task_struct *tsk)
+int thread_group_cputime_alloc(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	struct task_cputime *cputime;
 
 	/*
 	 * If we have multiple threads and we don't already have a
-	 * per-CPU task_cputime struct, allocate one and fill it in with
-	 * the times accumulated so far.
+	 * per-CPU task_cputime struct (checked in the caller), allocate
+	 * one and fill it in with the times accumulated so far.  We may
+	 * race with another thread so recheck after we pick up the sighand
+	 * lock.
 	 */
-	if (sig->cputime.totals)
-		return 0;
 	cputime = alloc_percpu(struct task_cputime);
 	if (cputime == NULL)
 		return -ENOMEM;
-	read_lock(&tasklist_lock);
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (sig->cputime.totals) {
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 		free_percpu(cputime);
 		return 0;
 	}
 	sig->cputime.totals = cputime;
-	cputime = per_cpu_ptr(sig->cputime.totals, get_cpu());
+	cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
 	cputime->utime = tsk->utime;
 	cputime->stime = tsk->stime;
 	cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
-	put_cpu_no_resched();
 	spin_unlock_irq(&tsk->sighand->siglock);
-	read_unlock(&tasklist_lock);
 	return 0;
 }
 
 /**
- * thread_group_cputime_smp - Sum the thread group time fields across all CPUs.
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
  *
  * @tsk:	The task we use to identify the thread group.
  * @times:	task_cputime structure in which we return the summed fields.
@@ -58,7 +54,7 @@ int thread_group_cputime_alloc_smp(struct task_struct *tsk)
  * Walk the list of CPUs to sum the per-CPU time fields in the thread group
  * time structure.
  */
-void thread_group_cputime_smp(
+void thread_group_cputime(
 	struct task_struct *tsk,
 	struct task_cputime *times)
 {
@@ -83,8 +79,6 @@ void thread_group_cputime_smp(
 	}
 }
 
-#endif /* CONFIG_SMP */
-
 /*
  * Called after updating RLIMIT_CPU to set timer expiration if necessary.
  */
@@ -300,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = task_sched_runtime(p);
+		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
@@ -309,16 +303,15 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
- * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
  */
-static int cpu_clock_sample_group_locked(unsigned int clock_idx,
-					 struct task_struct *p,
-					 union cpu_time_count *cpu)
+static int cpu_clock_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
 {
 	struct task_cputime cputime;
 
 	thread_group_cputime(p, &cputime);
-	switch (clock_idx) {
+	switch (which_clock) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
@@ -328,29 +321,12 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
 }
 
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
- */
-static int cpu_clock_sample_group(const clockid_t which_clock,
-				  struct task_struct *p,
-				  union cpu_time_count *cpu)
-{
-	int ret;
-	unsigned long flags;
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
-					    cpu);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	return ret;
-}
-
 
 int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
@@ -1324,29 +1300,37 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
  * fastpath_timer_check - POSIX CPU timers fast path.
  *
  * @tsk:	The task (thread) being checked.
- * @sig:	The signal pointer for that task.
  *
- * If there are no timers set return false.  Otherwise snapshot the task and
- * thread group timers, then compare them with the corresponding expiration
- # times.  Returns true if a timer has expired, else returns false.
+ * Check the task and thread group timers.  If both are zero (there are no
+ * timers set) return false.  Otherwise snapshot the task and thread group
+ * timers and compare them with the corresponding expiration times.  Return
+ * true if a timer has expired, else return false.
  */
-static inline int fastpath_timer_check(struct task_struct *tsk,
-					struct signal_struct *sig)
+static inline int fastpath_timer_check(struct task_struct *tsk)
 {
-	struct task_cputime task_sample = {
-		.utime = tsk->utime,
-		.stime = tsk->stime,
-		.sum_exec_runtime = tsk->se.sum_exec_runtime
-	};
-	struct task_cputime group_sample;
+	struct signal_struct *sig = tsk->signal;
 
-	if (task_cputime_zero(&tsk->cputime_expires) &&
-	    task_cputime_zero(&sig->cputime_expires))
+	if (unlikely(!sig))
 		return 0;
-	if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
-		return 1;
-	thread_group_cputime(tsk, &group_sample);
-	return task_cputime_expired(&group_sample, &sig->cputime_expires);
+
+	if (!task_cputime_zero(&tsk->cputime_expires)) {
+		struct task_cputime task_sample = {
+			.utime = tsk->utime,
+			.stime = tsk->stime,
+			.sum_exec_runtime = tsk->se.sum_exec_runtime
+		};
+
+		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+			return 1;
+	}
+	if (!task_cputime_zero(&sig->cputime_expires)) {
+		struct task_cputime group_sample;
+
+		thread_group_cputime(tsk, &group_sample);
+		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+			return 1;
+	}
+	return 0;
 }
 
 /*
@@ -1358,43 +1342,34 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
-	struct signal_struct *sig;
-	struct sighand_struct *sighand;
-	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
-	/* Pick up tsk->signal and make sure it's valid. */
-	sig = tsk->signal;
 	/*
 	 * The fast path checks that there are no expired thread or thread
-	 * group timers.  If that's so, just return.  Also check that
-	 * tsk->signal is non-NULL; this probably can't happen but cover the
-	 * possibility anyway.
+	 * group timers.  If that's so, just return.
 	 */
-	if (unlikely(!sig) || !fastpath_timer_check(tsk, sig))
+	if (!fastpath_timer_check(tsk))
 		return;
 
-	sighand = lock_task_sighand(tsk, &flags);
-	if (likely(sighand)) {
-		/*
-		 * Here we take off tsk->signal->cpu_timers[N] and
-		 * tsk->cpu_timers[N] all the timers that are firing, and
-		 * put them on the firing list.
-		 */
-		check_thread_timers(tsk, &firing);
-		check_process_timers(tsk, &firing);
+	spin_lock(&tsk->sighand->siglock);
+	/*
+	 * Here we take off tsk->signal->cpu_timers[N] and
+	 * tsk->cpu_timers[N] all the timers that are firing, and
+	 * put them on the firing list.
+	 */
+	check_thread_timers(tsk, &firing);
+	check_process_timers(tsk, &firing);
 
-		/*
-		 * We must release these locks before taking any timer's lock.
-		 * There is a potential race with timer deletion here, as the
-		 * siglock now protects our private firing list.  We have set
-		 * the firing flag in each timer, so that a deletion attempt
-		 * that gets the timer lock before we do will give it up and
-		 * spin until we've taken care of that timer below.
-		 */
-	}
-	unlock_task_sighand(tsk, &flags);
+	/*
+	 * We must release these locks before taking any timer's lock.
+	 * There is a potential race with timer deletion here, as the
+	 * siglock now protects our private firing list.  We have set
+	 * the firing flag in each timer, so that a deletion attempt
+	 * that gets the timer lock before we do will give it up and
+	 * spin until we've taken care of that timer below.
+	 */
+	spin_unlock(&tsk->sighand->siglock);
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
@@ -1433,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+	cpu_clock_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
 		if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched.c b/kernel/sched.c
index c51b5d276665..260c22cc530a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4039,55 +4039,22 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
  */
-static unsigned long long task_delta_exec(struct task_struct *p, struct rq *rq)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
+	struct rq *rq;
+	unsigned long flags;
+	u64 ns = 0;
+
+	rq = task_rq_lock(p, &flags);
 	if (task_current(rq, p)) {
 		u64 delta_exec;
 
 		update_rq_clock(rq);
 		delta_exec = rq->clock - p->se.exec_start;
 		if ((s64)delta_exec > 0)
-			return delta_exec;
+			ns = delta_exec;
 	}
-	return 0;
-}
-
-/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	u64 ns;
-	struct rq *rq;
-
-	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime + task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
-
-	return ns;
-}
-
-/*
- * Return sum_exec_runtime for the thread group plus any more ns on the
- * sched_clock that have not yet been banked in case the task is currently
- * running.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-	unsigned long flags;
-	u64 ns;
-	struct rq *rq;
-	struct task_cputime totals;
-
-	rq = task_rq_lock(p, &flags);
-	thread_group_cputime(p, &totals);
-	ns = totals.sum_exec_runtime + task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
 
 	return ns;
 }
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..d6903bd0c7a8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -270,3 +270,139 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #define sched_info_switch(t, next)		do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick.  None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+#ifdef CONFIG_SMP
+
+/**
+ * thread_group_cputime_account_user - Maintain utime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the utime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_system - Maintain stime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the stime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
+ *						thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @ns:		Time value by which to increment the sum_exec_runtime field
+ *		of that structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
+
+#else /* CONFIG_SMP */
+
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+}
+
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+}
+
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	tgtimes->totals->sum_exec_runtime += ns;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * These are the generic time-accounting routines that use the above
+ * functions.  They are the functions actually called by the scheduler.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					    cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_user(&sig->cputime, cputime);
+}
+
+static inline void account_group_system_time(struct task_struct *tsk,
+					      cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_system(&sig->cputime, cputime);
+}
+
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					       unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+}
-- 
cgit v1.2.3-71-gd317


From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 22 Sep 2008 14:42:50 -0700
Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value

With the recent changes ->it_sigev_signo and ->it_sigev_value are only
used in sys_timer_create(), kill them.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: mingo@elte.hu
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/posix-timers.h |  2 --
 kernel/posix-timers.c        | 17 +++++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index f9d8e9e94e9b..a7c721355549 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -45,8 +45,6 @@ struct k_itimer {
 	int it_requeue_pending;		/* waiting to requeue this timer */
 #define REQUEUE_PENDING 1
 	int it_sigev_notify;		/* notify word of sigevent struct */
-	int it_sigev_signo;		/* signo word of sigevent struct */
-	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7be385fe4eca..3eff47b0d8d5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -510,10 +510,6 @@ sys_timer_create(const clockid_t which_clock,
 			error = -EFAULT;
 			goto out;
 		}
-		new_timer->it_sigev_notify = event.sigev_notify;
-		new_timer->it_sigev_signo = event.sigev_signo;
-		new_timer->it_sigev_value = event.sigev_value;
-
 		rcu_read_lock();
 		process = good_sigevent(&event);
 		if (process)
@@ -524,17 +520,18 @@ sys_timer_create(const clockid_t which_clock,
 			goto out;
 		}
 	} else {
-		new_timer->it_sigev_notify = SIGEV_SIGNAL;
-		new_timer->it_sigev_signo = SIGALRM;
-		new_timer->it_sigev_value.sival_int = new_timer->it_id;
+		event.sigev_notify = SIGEV_SIGNAL;
+		event.sigev_signo = SIGALRM;
+		event.sigev_value.sival_int = new_timer->it_id;
 		process = current->group_leader;
 		get_task_struct(process);
 	}
 
-	new_timer->sigq->info.si_code  = SI_TIMER;
+	new_timer->it_sigev_notify     = event.sigev_notify;
+	new_timer->sigq->info.si_signo = event.sigev_signo;
+	new_timer->sigq->info.si_value = event.sigev_value;
 	new_timer->sigq->info.si_tid   = new_timer->it_id;
-	new_timer->sigq->info.si_signo = new_timer->it_sigev_signo;
-	new_timer->sigq->info.si_value = new_timer->it_sigev_value;
+	new_timer->sigq->info.si_code  = SI_TIMER;
 
 	spin_lock_irq(&current->sighand->siglock);
 	new_timer->it_process = process;
-- 
cgit v1.2.3-71-gd317


From 1b02469088ac7a13d7e622b618b7410d0f1ce5ec Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Mon, 22 Sep 2008 14:42:43 -0700
Subject: hrtimer: reorder struct hrtimer to save 8 bytes on 64bit builds

reorder struct hrtimer to save 8 bytes on 64 bit builds when
CONFIG_TIMER_STATS selected.  (also removes 8 bytes from signal_struct)

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 68b0196d8696..8730b60c9432 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -115,12 +115,12 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
-	enum hrtimer_cb_mode		cb_mode;
 	struct list_head		cb_entry;
+	enum hrtimer_cb_mode		cb_mode;
 #ifdef CONFIG_TIMER_STATS
+	int				start_pid;
 	void				*start_site;
 	char				start_comm[16];
-	int				start_pid;
 #endif
 };
 
-- 
cgit v1.2.3-71-gd317


From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Mon, 22 Sep 2008 14:42:44 -0700
Subject: ntp: improve adjtimex frequency rounding

Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits
(19 is the amount of the factor 2 in PPM_SCALE), the output frequency
can then be calculated back to its input value, as the inverse divide
produce a slightly larger value, which is then correctly rounded by the
final shift.

Reported-by: Martin Ziegler <ziegler@uni-freiburg.de>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timex.h | 2 +-
 kernel/time/ntp.c     | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index c00bcdd3ae42..9007313b5b71 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -82,7 +82,7 @@
  */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
 #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
-#define PPM_SCALE_INV_SHIFT 20
+#define PPM_SCALE_INV_SHIFT 19
 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
 		       PPM_SCALE + 1)
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 450a45cb01c1..ddb0465a6baa 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -406,9 +406,8 @@ adj_done:
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
-	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
-					 (s64)PPM_SCALE_INV,
-					 NTP_SCALE_SHIFT);
+	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+					 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
-- 
cgit v1.2.3-71-gd317


From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v3

- fix UP lockup
- another set of UP/SMP cleanups and simplifications

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 -
 kernel/sched.c        |   1 -
 kernel/sched_stats.h  | 126 +++++++++++++++-----------------------------------
 3 files changed, 38 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b982fb48c8f0..23d9d5464544 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2134,7 +2134,6 @@ static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 	return thread_group_cputime_alloc(curr);
 }
 
-
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
 	free_percpu(sig->cputime.totals);
diff --git a/kernel/sched.c b/kernel/sched.c
index 260c22cc530a..29a3152c45db 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,7 +4046,6 @@ unsigned long long task_delta_exec(struct task_struct *p)
 	unsigned long flags;
 	u64 ns = 0;
 
-	rq = task_rq_lock(p, &flags);
 	if (task_current(rq, p)) {
 		u64 delta_exec;
 
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index d6903bd0c7a8..b8c156979cf2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -276,133 +276,83 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
  * on CONFIG_SCHEDSTATS.
  */
 
-#ifdef CONFIG_SMP
-
 /**
- * thread_group_cputime_account_user - Maintain utime for a thread group.
+ * account_group_user_time - Maintain utime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the utime field of that
- *		structure.
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the utime field of the
+ *		thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the utime field there.
  */
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
+static inline void account_group_user_time(struct task_struct *tsk,
+					   cputime_t cputime)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->utime = cputime_add(times->utime, cputime);
 		put_cpu_no_resched();
 	}
 }
 
 /**
- * thread_group_cputime_account_system - Maintain stime for a thread group.
+ * account_group_system_time - Maintain stime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the stime field of that
- *		structure.
+ * @tsk:	Pointer to task structure.
+ * @cputime:	Time value by which to increment the stime field of the
+ *		thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the stime field there.
  */
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
+static inline void account_group_system_time(struct task_struct *tsk,
+					     cputime_t cputime)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->stime = cputime_add(times->stime, cputime);
 		put_cpu_no_resched();
 	}
 }
 
 /**
- * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
- *						thread group.
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
  *
- * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @tsk:	Pointer to task structure.
  * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of that structure.
+ *		of the thread_group_cputime structure.
  *
  * If thread group time is being maintained, get the structure for the
  * running CPU and update the sum_exec_runtime field there.
  */
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					      unsigned long long ns)
 {
-	if (tgtimes->totals) {
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
 		struct task_cputime *times;
 
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
 		times->sum_exec_runtime += ns;
 		put_cpu_no_resched();
 	}
 }
-
-#else /* CONFIG_SMP */
-
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
-}
-
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
-}
-
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	tgtimes->totals->sum_exec_runtime += ns;
-}
-
-#endif /* CONFIG_SMP */
-
-/*
- * These are the generic time-accounting routines that use the above
- * functions.  They are the functions actually called by the scheduler.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-					    cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_user(&sig->cputime, cputime);
-}
-
-static inline void account_group_system_time(struct task_struct *tsk,
-					      cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_system(&sig->cputime, cputime);
-}
-
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					       unsigned long long ns)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
-}
-- 
cgit v1.2.3-71-gd317


From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 17 Oct 2008 09:59:47 +0200
Subject: NOHZ: unify the nohz function calls in irq_enter()

We have two separate nohz function calls in irq_enter() for no good
reason. Just call a single NOHZ function from irq_enter() and call
the bits in the tick code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/tick.h     |  7 +++----
 kernel/softirq.c         | 10 +++-------
 kernel/time/tick-sched.c | 13 ++++++++++++-
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 98921a3e1aa8..b6ec8189ac0c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
 extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern void tick_check_idle(int cpu);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,26 +108,23 @@ static inline void tick_init(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
 extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
-extern void tick_nohz_update_jiffies(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
-extern void tick_nohz_stop_idle(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
-static inline void tick_nohz_update_jiffies(void) { }
 static inline ktime_t tick_nohz_get_sleep_length(void)
 {
 	ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
 
 	return len;
 }
-static inline void tick_nohz_stop_idle(int cpu) { }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 37d67aa2d56f..d410014279e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -265,16 +265,12 @@ asmlinkage void do_softirq(void)
  */
 void irq_enter(void)
 {
-#ifdef CONFIG_NO_HZ
 	int cpu = smp_processor_id();
+
 	if (idle_cpu(cpu) && !in_interrupt())
-		tick_nohz_stop_idle(cpu);
-#endif
+		tick_check_idle(cpu);
+
 	__irq_enter();
-#ifdef CONFIG_NO_HZ
-	if (idle_cpu(cpu))
-		tick_nohz_update_jiffies();
-#endif
 }
 
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b711ffcb106c..fdcf3f93bb8d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void)
 	touch_softlockup_watchdog();
 }
 
-void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
@@ -558,6 +558,17 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 
 #endif /* NO_HZ */
 
+/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_check_idle(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	tick_nohz_stop_idle(cpu);
+	tick_nohz_update_jiffies();
+#endif
+}
+
 /*
  * High resolution timer specific code
  */
-- 
cgit v1.2.3-71-gd317