From e725c731e3bb1e892e7b564c945b121cb41d1087 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 13:37:33 -0500 Subject: tracing: Split tracing initialization into two for early initialization Create an early_trace_init() function that will initialize the buffers and allow for ealier use of trace_printk(). This will also allow for future work to have function tracing start earlier at boot up. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3633e8beff39..569db5589851 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,8 +42,10 @@ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING void trace_init(void); +void early_trace_init(void); #else static inline void trace_init(void) { } +static inline void early_trace_init(void) { } #endif struct module; -- cgit v1.2.3-71-gd317 From 42c269c88dc146982a54a8267f71abc99f12852a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 16:15:39 -0500 Subject: ftrace: Allow for function tracing to record init functions on boot up Adding a hook into free_reserve_area() that informs ftrace that boot up init text is being free, lets ftrace safely remove those init functions from its records, which keeps ftrace from trying to modify text that no longer exists. Note, this still does not allow for tracing .init text of modules, as modules require different work for freeing its init code. Link: http://lkml.kernel.org/r/1488502497.7212.24.camel@linux.intel.com Cc: linux-mm@kvack.org Cc: Vlastimil Babka Cc: Mel Gorman Cc: Peter Zijlstra Requested-by: Todd Brandt Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 5 +++++ include/linux/init.h | 4 +++- kernel/trace/ftrace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 4 ++++ scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 6 files changed, 58 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 569db5589851..0276a2c487e6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -146,6 +146,10 @@ struct ftrace_ops_hash { struct ftrace_hash *filter_hash; struct mutex regex_lock; }; + +void ftrace_free_mem(void *start, void *end); +#else +static inline void ftrace_free_mem(void *start, void *end) { } #endif /* @@ -262,6 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } +static inline void ftrace_free_mem(void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/include/linux/init.h b/include/linux/init.h index 79af0962fd52..94769d687cf0 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace __latent_entropy +#define __init __section(.init.text) __cold __inittrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) @@ -68,8 +68,10 @@ #ifdef MODULE #define __exitused +#define __inittrace notrace #else #define __exitused __used +#define __inittrace #endif #define __exit __section(.exit.text) __exitused __cold notrace diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9691ee8f6c1..0556a202c055 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5262,6 +5262,50 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ +void ftrace_free_mem(void *start_ptr, void *end_ptr) +{ + unsigned long start = (unsigned long)start_ptr; + unsigned long end = (unsigned long)end_ptr; + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + int order; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + pg->index--; + if (!pg->index) { + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + mutex_unlock(&ftrace_lock); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..eee82bfb7cd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -6605,6 +6606,9 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; + /* This may be .init text, inform ftrace to remove it */ + ftrace_free_mem(start, end); + start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index aeb34223167c..16e086dcc567 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -412,6 +412,7 @@ static int is_mcounted_section_name(char const *const txtname) { return strcmp(".text", txtname) == 0 || + strcmp(".init.text", txtname) == 0 || strcmp(".ref.text", txtname) == 0 || strcmp(".sched.text", txtname) == 0 || strcmp(".spinlock.text", txtname) == 0 || diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 0b6002b36f20..1633c3e6c0b9 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) { # Acceptable sections to record. my %text_sections = ( ".text" => 1, + ".init.text" => 1, ".ref.text" => 1, ".sched.text" => 1, ".spinlock.text" => 1, -- cgit v1.2.3-71-gd317 From af0009fc16a45d091f896794e97a6457f9a7eddf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Mar 2017 11:01:06 -0400 Subject: tracing: Move trace_handle_return() out of line Currently trace_handle_return() looks like this: static inline enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } Where trace_seq_overflowed(s) is: static inline bool trace_seq_has_overflowed(struct trace_seq *s) { return s->full || seq_buf_has_overflowed(&s->seq); } And seq_buf_has_overflowed(&s->seq) is: static inline bool seq_buf_has_overflowed(struct seq_buf *s) { return s->len > s->size; } Making trace_handle_return() into: return (s->full || (s->seq->len > s->seq->size)) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; One would think this is not an issue to keep as an inline. But because this is used in the TRACE_EVENT() macro, it is extended for every tracepoint in the system. Taking a look at a single tracepoint x86_irq_vector (was the first one I randomly chosen). As trace_handle_return is used in the TRACE_EVENT() macro of trace_raw_output_##call() we disassemble trace_raw_output_x86_irq_vector and do a diff: - is the original + is the out-of-line code I removed identical lines that were different just due to different addresses. --- /tmp/irq-vec-orig 2017-03-16 09:12:48.569384851 -0400 +++ /tmp/irq-vec-ool 2017-03-16 09:13:39.378153385 -0400 @@ -6,27 +6,23 @@ 53 push %rbx 48 89 fb mov %rdi,%rbx 4c 8b a7 c0 20 00 00 mov 0x20c0(%rdi),%r12 e8 f7 72 13 00 callq ffffffff81155c80 83 f8 01 cmp $0x1,%eax 74 05 je ffffffff8101e993 5b pop %rbx 41 5c pop %r12 5d pop %rbp c3 retq 41 8b 54 24 08 mov 0x8(%r12),%edx - 48 8d bb 98 10 00 00 lea 0x1098(%rbx),%rdi + 48 81 c3 98 10 00 00 add $0x1098,%rbx - 48 c7 c6 7b 8a a0 81 mov $0xffffffff81a08a7b,%rsi + 48 c7 c6 ab 8a a0 81 mov $0xffffffff81a08aab,%rsi - e8 c5 85 13 00 callq ffffffff81156f70 === here's the start of the main difference === + 48 89 df mov %rbx,%rdi + e8 62 7e 13 00 callq ffffffff81156810 - 8b 93 b8 20 00 00 mov 0x20b8(%rbx),%edx - 31 c0 xor %eax,%eax - 85 d2 test %edx,%edx - 75 11 jne ffffffff8101e9c8 - 48 8b 83 a8 20 00 00 mov 0x20a8(%rbx),%rax - 48 39 83 a0 20 00 00 cmp %rax,0x20a0(%rbx) - 0f 93 c0 setae %al + 48 89 df mov %rbx,%rdi + e8 4a c5 12 00 callq ffffffff8114af00 5b pop %rbx - 0f b6 c0 movzbl %al,%eax === end === 41 5c pop %r12 5d pop %rbp c3 retq If you notice, the original has 22 bytes of text more than the out of line version. As this is for every TRACE_EVENT() defined in the system, this can become quite large. text data bss dec hex filename 8690305 5450490 1298432 15439227 eb957b vmlinux-orig 8681725 5450490 1298432 15430647 eb73f7 vmlinux-handle This change has a total of 8580 bytes in savings. $ objdump -dr /tmp/vmlinux-orig | grep '^[0-9a-f]* Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 11 +---------- kernel/trace/trace.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0af63c4381b9..a556805eff8a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -138,16 +138,7 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} +enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4fa8e8f3c765..b5d4b80f2d45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1998,6 +1998,18 @@ void tracing_record_cmdline(struct task_struct *tsk) __this_cpu_write(trace_cmdline_save, false); } +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? + TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc) -- cgit v1.2.3-71-gd317 From b80f0f6c9ed3958ff4002b6135f43a1ef312a610 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 12:57:35 -0400 Subject: ftrace: Have init/main.c call ftrace directly to free init memory Relying on free_reserved_area() to call ftrace to free init memory proved to not be sufficient. The issue is that on x86, when debug_pagealloc is enabled, the init memory is not freed, but simply set as not present. Since ftrace was uninformed of this, starting function tracing still tries to update pages that are not present according to the page tables, causing ftrace to bug, as well as killing the kernel itself. Instead of relying on free_reserved_area(), have init/main.c call ftrace directly just before it frees the init memory. Then it needs to use __init_begin and __init_end to know where the init memory location is. Looking at all archs (and testing what I can), it appears that this should work for each of them. Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- init/main.c | 1 + kernel/trace/ftrace.c | 7 ++++--- mm/page_alloc.c | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0276a2c487e6..ef7123219f14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,9 +147,9 @@ struct ftrace_ops_hash { struct mutex regex_lock; }; -void ftrace_free_mem(void *start, void *end); +void ftrace_free_init_mem(void); #else -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* @@ -266,7 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/init/main.c b/init/main.c index c0137b916aa1..0e8849f74561 100644 --- a/init/main.c +++ b/init/main.c @@ -962,6 +962,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + ftrace_free_init_mem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aff7a2c08387..8efd9fe7aec0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,7 @@ #include +#include #include #include "trace_output.h" @@ -5279,10 +5280,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void __init ftrace_free_init_mem(void) { - unsigned long start = (unsigned long)start_ptr; - unsigned long end = (unsigned long)end_ptr; + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee82bfb7cd8..178bf9c2a2cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6606,9 +6606,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; - /* This may be .init text, inform ftrace to remove it */ - ftrace_free_mem(start, end); - start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { -- cgit v1.2.3-71-gd317 From 5367278cb7ba74537bcad1470d75f30d95b09c14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 12:26:20 -0400 Subject: tracing: Add stack_tracer_disable/enable() functions There are certain parts of the kernel that cannot let stack tracing proceed (namely in RCU), because the stack tracer uses RCU, and parts of RCU internals cannot handle having RCU read side locks taken. Add stack_tracer_disable() and stack_tracer_enable() functions to let RCU stop stack tracing on the current CPU when it is in those critical sections. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++++ kernel/trace/trace_stack.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef7123219f14..7b4e6572ab21 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -286,6 +286,12 @@ int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +void stack_tracer_disable(void); +void stack_tracer_enable(void); +#else +static inline void stack_tracer_disable(void) { } +static inline void stack_tracer_enable(void) { } #endif struct ftrace_func_command { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 338d076a06da..21e536cf66e4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -41,6 +41,38 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(trace_active); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(trace_active); +} + void stack_trace_print(void) { long i; -- cgit v1.2.3-71-gd317 From 8aaf1ee70e19ac74cbbb81098edfa328d1ab4bd7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 15:47:32 -0400 Subject: tracing: Rename trace_active to disable_stack_tracer and inline its modification In order to eliminate a function call, make "trace_active" into "disable_stack_tracer" and convert stack_tracer_disable() and friends into static inline functions. Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 +++++++++++++++++++++++++++++++-- kernel/trace/trace_stack.c | 50 +++++++++------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7b4e6572ab21..06b2990a35e4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -287,8 +287,40 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -void stack_tracer_disable(void); -void stack_tracer_enable(void); +/* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ +DECLARE_PER_CPU(int, disable_stack_tracer); + +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +static inline void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(disable_stack_tracer); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +static inline void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(disable_stack_tracer); +} #else static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21e536cf66e4..f2f02ff350d4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -35,44 +35,12 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -/** - * stack_tracer_disable - temporarily disable the stack tracer - * - * There's a few locations (namely in RCU) where stack tracing - * cannot be executed. This function is used to disable stack - * tracing during those critical sections. - * - * This function must be called with preemption or interrupts - * disabled and stack_tracer_enable() must be called shortly after - * while preemption or interrupts are still disabled. - */ -void stack_tracer_disable(void) -{ - /* Preemption or interupts must be disabled */ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_inc(trace_active); -} - -/** - * stack_tracer_enable - re-enable the stack tracer - * - * After stack_tracer_disable() is called, stack_tracer_enable() - * must be called shortly afterward. - */ -void stack_tracer_enable(void) -{ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_dec(trace_active); -} - void stack_trace_print(void) { long i; @@ -243,8 +211,8 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); /* no atomic needed, we only modify this variable by this cpu */ - __this_cpu_inc(trace_active); - if (__this_cpu_read(trace_active) != 1) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -252,7 +220,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -294,15 +262,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. */ - __this_cpu_inc(trace_active); + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); local_irq_restore(flags); return count; @@ -338,7 +306,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { local_irq_disable(); - __this_cpu_inc(trace_active); + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); @@ -352,7 +320,7 @@ static void t_stop(struct seq_file *m, void *p) { arch_spin_unlock(&stack_trace_max_lock); - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); local_irq_enable(); } -- cgit v1.2.3-71-gd317 From 03ecd3f48e57f2e6154584e0ee7450d7a05e2d3b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Apr 2017 12:20:36 -0400 Subject: rcu/tracing: Add rcu_disabled to denote when rcu_irq_enter() will not work Tracing uses rcu_irq_enter() as a way to make sure that RCU is watching when it needs to use rcu_read_lock() and friends. This is because tracing can happen as RCU is about to enter user space, or about to go idle, and RCU does not watch for RCU read side critical sections as it makes the transition. There is a small location within the RCU infrastructure that rcu_irq_enter() itself will not work. If tracing were to occur in that section it will break if it tries to use rcu_irq_enter(). Originally, this happens with the stack_tracer, because it will call save_stack_trace when it encounters stack usage that is greater than any stack usage it had encountered previously. There was a case where that happened in the RCU section where rcu_irq_enter() did not work, and lockdep complained loudly about it. To fix it, stack tracing added a call to be disabled and RCU would disable stack tracing during the critical section that rcu_irq_enter() was inoperable. This solution worked, but there are other cases that use rcu_irq_enter() and it would be a good idea to let RCU give a way to let others know that rcu_irq_enter() will not work. For example, in trace events. Another helpful aspect of this change is that it also moves the per cpu variable called in the RCU critical section into a cache locale along with other RCU per cpu variables used in that same location. I'm keeping the stack_trace_disable() code, as that still could be used in the future by places that really need to disable it. And since it's only a static inline, it wont take up any kernel text if it is not used. Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/rcupdate.h | 5 +++++ kernel/rcu/tree.c | 18 ++++++++++++++++-- kernel/trace/trace_stack.c | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..dea8f17b2fe3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -97,6 +97,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +bool rcu_irq_enter_disabled(void); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -113,6 +114,10 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +static inline bool rcu_irq_enter_disabled(void) +{ + return false; +} #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b4d273331e4..a6dcf3bd244f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -284,6 +284,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * There's a few places, currently just in the tracing infrastructure, + * that uses rcu_irq_enter() to make sure RCU is watching. But there's + * a small location where that will not even work. In those cases + * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() + * can be called. + */ +static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); + +bool rcu_irq_enter_disabled(void) +{ + return this_cpu_read(disable_rcu_irq_enter); +} + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. @@ -800,10 +814,10 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - stack_tracer_disable(); + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - stack_tracer_enable(); + __this_cpu_dec(disable_rcu_irq_enter); rcu_dynticks_task_enter(); /* diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f2f02ff350d4..76aa04d4c925 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; + /* + * There's a slight chance that we are tracing inside the + * RCU infrastructure, and rcu_irq_enter() will not work + * as expected. + */ + if (unlikely(rcu_irq_enter_disabled())) + return; + local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); -- cgit v1.2.3-71-gd317 From d54b6eeb553c89ed8d4c5a2ed73df374a45b9562 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Apr 2017 12:40:49 -0400 Subject: tracing: Make sure rcu_irq_enter() can work for trace_*_rcuidle() trace events Stack tracing discovered that there's a small location inside the RCU infrastructure where calling rcu_irq_enter() does not work. As trace events use rcu_irq_enter() it must make sure that it is functionable. A check against rcu_irq_enter_disabled() is added with a WARN_ON_ONCE() as no trace event should ever be used in that part of RCU. If the warning is triggered, then the trace event is ignored. Restructure the __DO_TRACE() a bit to get rid of the prercu and postrcu, and just have an rcucheck that does the work from within the _DO_TRACE() macro. gcc optimization will compile out the rcucheck=0 case. Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home Acked-by: Mathieu Desnoyers Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index f72fcfe0e66a..cc48cb2ce209 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -128,7 +128,7 @@ extern void syscall_unregfunc(void); * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \ +#define __DO_TRACE(tp, proto, args, cond, rcucheck) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ @@ -136,7 +136,11 @@ extern void syscall_unregfunc(void); \ if (!(cond)) \ return; \ - prercu; \ + if (rcucheck) { \ + if (WARN_ON_ONCE(rcu_irq_enter_disabled())) \ + return; \ + rcu_irq_enter_irqson(); \ + } \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ @@ -147,20 +151,19 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ rcu_read_unlock_sched_notrace(); \ - postrcu; \ + if (rcucheck) \ + rcu_irq_exit_irqson(); \ } while (0) #ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ static inline void trace_##name##_rcuidle(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond), \ - rcu_irq_enter_irqson(), \ - rcu_irq_exit_irqson()); \ + TP_CONDITION(cond), 1); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) @@ -186,7 +189,7 @@ extern void syscall_unregfunc(void); __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond),,); \ + TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ rcu_dereference_sched(__tracepoint_##name.funcs);\ -- cgit v1.2.3-71-gd317 From ec19b85913486993d7d6f747beed1a711afd47d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:01:14 -0400 Subject: ftrace: Move the probe function into the tracing directory As nothing outside the tracing directory uses the function probes mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the probe functions must also be local to tracing. But luckily nothing else uses them. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 24 ------------------------ kernel/trace/trace.h | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 06b2990a35e4..3e790ff1c501 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -360,30 +360,6 @@ void ftrace_bug(int err, struct dyn_ftrace *rec); struct seq_file; -struct ftrace_probe_ops { - void (*func)(unsigned long ip, - unsigned long parent_ip, - void **data); - int (*init)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - void (*free)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - int (*print)(struct seq_file *m, - unsigned long ip, - struct ftrace_probe_ops *ops, - void *data); -}; - -extern int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); -extern void unregister_ftrace_function_probe_all(char *glob); - extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 31a4997b67c6..2ff6d49fa5ca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -923,6 +923,31 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*init)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + void (*free)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +extern int +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); -- cgit v1.2.3-71-gd317 From 92a68fa047ca5b8e1991af2d50b23ad9452613cd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:21:41 -0400 Subject: ftrace: Move the function commands into the tracing directory As nothing outside the tracing directory uses the function command mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the command functions must also be local to tracing. But luckily nothing else uses them. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 19 ------------------- kernel/trace/trace.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3e790ff1c501..774e7a95c201 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,14 +326,6 @@ static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } #endif -struct ftrace_func_command { - struct list_head list; - char *name; - int (*func)(struct ftrace_hash *hash, - char *func, char *cmd, - char *params, int enable); -}; - #ifdef CONFIG_DYNAMIC_FTRACE int ftrace_arch_code_modify_prepare(void); @@ -421,9 +413,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); void ftrace_ops_set_global_filter(struct ftrace_ops *ops); -int register_ftrace_command(struct ftrace_func_command *cmd); -int unregister_ftrace_command(struct ftrace_func_command *cmd); - enum { FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -639,14 +628,6 @@ static inline void ftrace_enable_daemon(void) { } static inline void ftrace_module_init(struct module *mod) { } static inline void ftrace_module_enable(struct module *mod) { } static inline void ftrace_release_mod(struct module *mod) { } -static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) -{ - return -EINVAL; -} -static inline __init int unregister_ftrace_command(char *cmd_name) -{ - return -EINVAL; -} static inline int ftrace_text_reserved(const void *start, const void *end) { return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ff6d49fa5ca..a63411c53c5e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -880,6 +880,13 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -948,10 +955,23 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} /* * The ops parameter passed in is usually undefined. * This must be a macro. -- cgit v1.2.3-71-gd317 From eee8ded131f15e0f5b1897c9c4a7687fabd28822 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 21:31:28 -0400 Subject: ftrace: Have the function probes call their own function Now that the function probes have their own ftrace_ops, there's no reason to continue using the ftrace_func_hash to find which probe to call in the function callback. The ops that is passed in to the function callback is part of the probe_ops to call. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 +- kernel/trace/ftrace.c | 225 +++++++++++++++++++++---------------------------- kernel/trace/trace.h | 1 + 3 files changed, 101 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 774e7a95c201..6d2a63e4ea52 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -443,8 +443,8 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_DO_HASH = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_DO_PROBES = (1 << 3), + FTRACE_ITER_PROBE = (1 << 4), FTRACE_ITER_ENABLED = (1 << 5), }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cf6b7263199a..493c7ff7e860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1096,14 +1096,7 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long ip; - struct list_head free_list; -}; +static LIST_HEAD(ftrace_func_probes); struct ftrace_func_entry { struct hlist_node hlist; @@ -1270,7 +1263,7 @@ static void remove_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { - hlist_del(&entry->hlist); + hlist_del_rcu(&entry->hlist); hash->count--; } @@ -3063,35 +3056,58 @@ struct ftrace_iterator { loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; - struct ftrace_func_probe *probe; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (list_empty(&ftrace_func_probes)) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + if (!iter->probe) { + next = ftrace_func_probes.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == &ftrace_func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3101,7 +3117,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3109,26 +3125,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; if (iter->func_pos > *pos) return NULL; - iter->hidx = 0; + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + p = t_probe_next(m, &l); if (!p) break; } @@ -3136,24 +3154,27 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, NULL); + if (probe->print) + return probe->print(m, probe_entry->ip, probe, NULL); - seq_printf(m, "%ps:%ps\n", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, (void *)probe->func); return 0; } @@ -3205,19 +3226,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (unlikely(ftrace_disabled)) return NULL; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); if (iter->flags & FTRACE_ITER_PRINTALL) { - /* next must increment pos, and t_hash_start does not */ + /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_hash_start(m, &l); + return t_probe_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_hash_start(m, &l); + return t_probe_start(m, &l); return ret; } @@ -3226,7 +3247,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3255,15 +3276,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_probe_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3279,7 +3300,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_hash_start(m, pos); + return t_probe_start(m, pos); return iter; } @@ -3310,8 +3331,8 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3490,7 +3511,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3765,16 +3786,9 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; - - key = hash_long(ip, FTRACE_HASH_BITS); + struct ftrace_probe_ops *probe_ops; - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; + probe_ops = container_of(op, struct ftrace_probe_ops, ops); /* * Disable preemption for these calls to prevent a RCU grace @@ -3782,20 +3796,10 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu_notrace(entry, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, entry->ops, NULL); - } + probe_ops->func(ip, parent_ip, probe_ops, NULL); preempt_enable_notrace(); } -static void ftrace_free_entry(struct ftrace_func_probe *entry) -{ - if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, NULL); - kfree(entry); -} - struct ftrace_func_map { struct ftrace_func_entry entry; void *data; @@ -3942,13 +3946,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct hlist_head hl; - struct hlist_node *n; - unsigned long key; int count = 0; int size; int ret; @@ -3961,6 +3961,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) { ops->ops.func = function_trace_probe_call; ftrace_ops_init(&ops->ops); + INIT_LIST_HEAD(&ops->list); } mutex_lock(&ops->ops.func_hash->regex_lock); @@ -3978,31 +3979,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (ret < 0) goto out; - INIT_HLIST_HEAD(&hl); - size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - probe = kmalloc(sizeof(*probe), GFP_KERNEL); - if (!probe) { - count = -ENOMEM; - goto err_free; - } - probe->ops = ops; - probe->ip = entry->ip; /* * The caller might want to do something special * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->init && ops->init(ops, entry->ip, data) < 0) { - kfree(probe); - goto err_free; + if (ops->init) { + ret = ops->init(ops, entry->ip, data); + if (ret < 0) + goto out; } - hlist_add_head(&probe->node, &hl); - count++; } } @@ -4012,17 +4003,15 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); if (ret < 0) - goto err_free_unlock; + goto out_unlock; - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - key = hash_long(probe->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&probe->node, &ftrace_func_hash[key]); - } + if (list_empty(&ops->list)) + list_add(&ops->list, &ftrace_func_probes); if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) ret = ftrace_startup(&ops->ops, 0); + out_unlock: mutex_unlock(&ftrace_lock); if (!ret) @@ -4032,34 +4021,22 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); return ret; - - err_free_unlock: - mutex_unlock(&ftrace_lock); - err_free: - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - if (ops->free) - ops->free(ops, probe->ip, NULL); - kfree(probe); - } - goto out; } int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_func_entry *entry; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; - struct list_head free_list; struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char str[KSYM_SYMBOL_LEN]; int i, ret; + int size; if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) return -EINVAL; @@ -4097,18 +4074,12 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) if (!hash) goto out_unlock; - INIT_LIST_HEAD(&free_list); - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; + INIT_HLIST_HEAD(&hhd); - hlist_for_each_entry_safe(entry, tmp, hhd, node) { - - /* break up if statements for readability */ - if (entry->ops != ops) - continue; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); @@ -4116,26 +4087,24 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) continue; } - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } /* Nothing found? */ - if (list_empty(&free_list)) { + if (hlist_empty(&hhd)) { ret = -EINVAL; goto out_unlock; } mutex_lock(&ftrace_lock); - if (ftrace_hash_empty(hash)) + if (ftrace_hash_empty(hash)) { ftrace_shutdown(&ops->ops, 0); + list_del_init(&ops->list); + } + ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); @@ -4146,9 +4115,11 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) &old_hash_ops); synchronize_sched(); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (ops->free) + ops->free(ops, entry->ip, NULL); + kfree(entry); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e16c67c49de4..d457addcc224 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -933,6 +933,7 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { struct ftrace_probe_ops { struct ftrace_ops ops; + struct list_head list; void (*func)(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, -- cgit v1.2.3-71-gd317 From 73a757e63114dfd765f1c5d1ff7e994f123d0234 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 May 2017 09:35:09 -0400 Subject: ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 40 +++++++++++++++++++++++++++++++++--- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 17 ++++++++++----- 4 files changed, 51 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b6d4568795a7..ee9b461af095 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -185,7 +185,7 @@ size_t ring_buffer_page_len(void *page); void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96fc3c043ad6..01b4ee5326cf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -4377,9 +4378,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4387,6 +4404,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4396,13 +4414,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index c190a4d5013c..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -171,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60c904fa5480..5b645b0fbbb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6054,6 +6054,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; unsigned int read; }; @@ -6383,9 +6384,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif - if (!info->spare) + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + info->spare_cpu = iter->cpu_file; + } if (!info->spare) return -ENOMEM; @@ -6445,7 +6448,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, + info->spare_cpu, info->spare); kfree(info); mutex_unlock(&trace_types_lock); @@ -6456,6 +6460,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct buffer_ref { struct ring_buffer *buffer; void *page; + int cpu; int ref; }; @@ -6467,7 +6472,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); buf->private = 0; } @@ -6501,7 +6506,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); spd->partial[i].private = 0; } @@ -6566,11 +6571,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, kfree(ref); break; } + ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } -- cgit v1.2.3-71-gd317