From a26ac2455ffcf3be5c6ef92bc6df7182700f2114 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2011 14:10:23 -0800 Subject: rcu: move TREE_RCU from softirq to kthread If RCU priority boosting is to be meaningful, callback invocation must be boosted in addition to preempted RCU readers. Otherwise, in presence of CPU real-time threads, the grace period ends, but the callbacks don't get invoked. If the callbacks don't get invoked, the associated memory doesn't get freed, so the system is still subject to OOM. But it is not reasonable to priority-boost RCU_SOFTIRQ, so this commit moves the callback invocations to a kthread, which can be boosted easily. Also add comments and properly synchronized all accesses to rcu_cpu_kthread_task, as suggested by Lai Jiangshan. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/interrupt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index bea0ac750712..6c12989839d9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -414,7 +414,6 @@ enum TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, - RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; -- cgit v1.2.3-71-gd317 From 4a29865689dbb87a02e3b0fff4a4ae5041273173 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Apr 2011 21:33:51 -0700 Subject: rcu: make rcutorture version numbers available through debugfs It is not possible to accurately correlate rcutorture output with that of debugfs. This patch therefore adds a debugfs file that prints out the rcutorture version number, permitting easy correlation. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 13 ++++++++++++- include/linux/rcutree.h | 3 +++ kernel/rcutorture.c | 8 +++++--- kernel/rcutree.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ff422d2b7f90..9e169c2ba91f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,18 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) +extern void rcutorture_record_test_transition(void); +extern void rcutorture_record_progress(unsigned long vernum); +#else +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#endif + #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) @@ -68,7 +80,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); -extern int sched_expedited_torture_stats(char *page); static inline void __rcu_read_lock_bh(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3a933482734a..284dad10c55b 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -58,9 +58,12 @@ static inline void synchronize_rcu_bh_expedited(void) extern void rcu_barrier(void); +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); + extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 22b0e74e7d99..c2f58ec24751 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -131,7 +131,7 @@ struct rcu_torture { static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture __rcu *rcu_torture_current; -static long rcu_torture_current_version; +static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -884,7 +884,7 @@ rcu_torture_writer(void *arg) old_rp->rtort_pipe_count++; cur_ops->deferred_free(old_rp); } - rcu_torture_current_version++; + rcutorture_record_progress(++rcu_torture_current_version); oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1064,7 +1064,7 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, @@ -1325,6 +1325,7 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { printk(KERN_WARNING /* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1616,6 +1617,7 @@ rcu_torture_init(void) } } register_reboot_notifier(&rcutorture_shutdown_nb); + rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8917401cbbc..bb84deca3319 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -101,6 +101,18 @@ static void invoke_rcu_cpu_kthread(void); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ +/* + * Track the rcutorture test sequence number and the update version + * number within a given test. The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. + */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -192,6 +204,31 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. + */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + /* * Force a quiescent state for RCU-sched. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index fc40e89a028c..3baa235786b5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -394,6 +394,29 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *rcudir; static int __init rcutree_trace_init(void) @@ -430,6 +453,11 @@ static int __init rcutree_trace_init(void) NULL, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; return 0; free_out: debugfs_remove_recursive(rcudir); -- cgit v1.2.3-71-gd317 From b0c9d7ff2793502650ad987c3f237d5fe5587a1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2011 12:56:56 -0700 Subject: rcu: add DEBUG_OBJECTS_RCU_HEAD check for alignment Verify that rcu_head structures are aligned to a four-byte boundary. This check is enabled by CONFIG_DEBUG_OBJECTS_RCU_HEAD. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9e169c2ba91f..c7aeacf7fc98 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -785,6 +785,7 @@ extern struct debug_obj_descr rcuhead_debug_descr; static inline void debug_rcu_head_queue(struct rcu_head *head) { + WARN_ON_ONCE((unsigned long)head & 0x3); debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, -- cgit v1.2.3-71-gd317 From 9ab1544eb4196ca8d05c433b2eb56f74496b1ee3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:15:47 +0800 Subject: rcu: introduce kfree_rcu() Many rcu callbacks functions just call kfree() on the base structure. These functions are trivial, but their size adds up, and furthermore when they are used in a kernel module, that module must invoke the high-latency rcu_barrier() function at module-unload time. The kfree_rcu() function introduced by this commit addresses this issue. Rather than encoding a function address in the embedded rcu_head structure, kfree_rcu() instead encodes the offset of the rcu_head structure within the base structure. Because the functions are not allowed in the low-order 4096 bytes of kernel virtual memory, offsets up to 4095 bytes can be accommodated. If the offset is larger than 4095 bytes, a compile-time error will be generated in __kfree_rcu(). If this error is triggered, you can either fall back to use of call_rcu() or rearrange the structure to position the rcu_head structure into the first 4096 bytes. Note that the allowable offset might decrease in the future, for example, to allow something like kmem_cache_free_rcu(). The new kfree_rcu() function can replace code as follows: call_rcu(&p->rcu, simple_kfree_callback); where "simple_kfree_callback()" might be defined as follows: void simple_kfree_callback(struct rcu_head *p) { struct foo *q = container_of(p, struct foo, rcu); kfree(q); } with the following: kfree_rcu(&p->rcu, rcu); Note that the "rcu" is the name of a field in the structure being freed. The reason for using this rather than passing in a pointer to the base structure is that the above approach allows better type checking. This commit is based on earlier work by Lai Jiangshan and Manfred Spraul: Lai's V1 patch: http://lkml.org/lkml/2008/9/18/1 Manfred's patch: http://lkml.org/lkml/2009/1/2/115 Signed-off-by: Lai Jiangshan Signed-off-by: Manfred Spraul Signed-off-by: Paul E. McKenney Reviewed-by: David Howells Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny.c | 2 +- kernel/rcutree.c | 2 +- 3 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c7aeacf7fc98..99f9aa7c2804 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -809,4 +809,60 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static __always_inline bool __is_kfree_rcu_offset(unsigned long offset) +{ + return offset < 4096; +} + +static __always_inline +void __kfree_rcu(struct rcu_head *head, unsigned long offset) +{ + typedef void (*rcu_callback)(struct rcu_head *); + + BUILD_BUG_ON(!__builtin_constant_p(offset)); + + /* See the kfree_rcu() header comment. */ + BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); + + call_rcu(head, (rcu_callback)offset); +} + +extern void kfree(const void *); + +static inline void __rcu_reclaim(struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) + kfree((void *)head - offset); + else + head->func(head); +} + +/** + * kfree_rcu() - kfree an object after a grace period. + * @ptr: pointer to kfree + * @rcu_head: the name of the struct rcu_head within the type of @ptr. + * + * Many rcu callbacks functions just call kfree() on the base structure. + * These functions are trivial, but their size adds up, and furthermore + * when they are used in a kernel module, that module must invoke the + * high-latency rcu_barrier() function at module-unload time. + * + * The kfree_rcu() function handles this issue. Rather than encoding a + * function address in the embedded rcu_head structure, kfree_rcu() instead + * encodes the offset of the rcu_head structure within the base structure. + * Because the functions are not allowed in the low-order 4096 bytes of + * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * If the offset is larger than 4095 bytes, a compile-time error will + * be generated in __kfree_rcu(). If this error is triggered, you can + * either fall back to use of call_rcu() or rearrange the structure to + * position the rcu_head structure into the first 4096 bytes. + * + * Note that the allowable offset might decrease in the future, for example, + * to allow something like kmem_cache_free_rcu(). + */ +#define kfree_rcu(ptr, rcu_head) \ + __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..4d60fbc9c64c 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -167,7 +167,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - list->func(list); + __rcu_reclaim(list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b579e4f97210..2c07adb97088 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1206,7 +1206,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - list->func(list); + __rcu_reclaim(list); list = next; if (++count >= rdp->blimit) break; -- cgit v1.2.3-71-gd317 From 29ce831000081dd757d3116bf774aafffc4b6b20 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 May 2011 16:31:03 +0300 Subject: rcu: provide rcu_virt_note_context_switch() function. Provide rcu_virt_note_context_switch() for vitalization use to note quiescent state during guest entry. Signed-off-by: Gleb Natapov Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 8 ++++++++ include/linux/rcutree.h | 10 ++++++++++ kernel/rcutree.c | 1 + 3 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 30ebd7c8d874..52b3e0281fd0 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -99,6 +99,14 @@ static inline void rcu_note_context_switch(int cpu) rcu_preempt_note_context_switch(); } +/* + * Take advantage of the fact that there is only one CPU, which + * allows us to ignore virtualization-based context switches. + */ +static inline void rcu_virt_note_context_switch(int cpu) +{ +} + /* * Return the number of grace periods. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 284dad10c55b..e65d06634dd8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,6 +35,16 @@ extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); +/* + * Note a virtualization-based context switch. This is simply a + * wrapper around rcu_note_context_switch(), which allows TINY_RCU + * to save a few bytes. + */ +static inline void rcu_virt_note_context_switch(int cpu) +{ + rcu_note_context_switch(cpu); +} + #ifdef CONFIG_TREE_PREEMPT_RCU extern void exit_rcu(void); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fe2a273df2..54ff7eb92819 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,6 +157,7 @@ void rcu_note_context_switch(int cpu) rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -- cgit v1.2.3-71-gd317