From be8f274323c26ddc7e6fd6c44254b7abcdbe6389 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:16:58 +0900 Subject: kprobes: Prohibit probing on .entry.text code .entry.text is a code area which is used for interrupt/syscall entries, which includes many sensitive code. Thus, it is better to prohibit probing on all of such code instead of a part of that. Since some symbols are already registered on kprobe blacklist, this also removes them from the blacklist. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Borislav Petkov Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jan Kiszka Cc: Jiri Kosina Cc: Jonathan Lebon Cc: Seiji Aguchi Link: http://lkml.kernel.org/r/20140417081658.26341.57354.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 925eaf28fca9..cdf9251f8249 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -265,6 +265,7 @@ extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); +extern bool arch_within_kprobe_blacklist(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; -- cgit v1.2.3-71-gd317 From 376e242429bf8539ef39a080ac113c8799840b13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:05 +0900 Subject: kprobes: Introduce NOKPROBE_SYMBOL() macro to maintain kprobes blacklist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce NOKPROBE_SYMBOL() macro which builds a kprobes blacklist at kernel build time. The usage of this macro is similar to EXPORT_SYMBOL(), placed after the function definition: NOKPROBE_SYMBOL(function); Since this macro will inhibit inlining of static/inline functions, this patch also introduces a nokprobe_inline macro for static/inline functions. In this case, we must use NOKPROBE_SYMBOL() for the inline function caller. When CONFIG_KPROBES=y, the macro stores the given function address in the "_kprobe_blacklist" section. Since the data structures are not fully initialized by the macro (because there is no "size" information), those are re-initialized at boot time by using kallsyms. Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081705.26341.96719.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Alok Kataria Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: Christopher Li Cc: Chris Wright Cc: David S. Miller Cc: Jan-Simon Möller Cc: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Randy Dunlap Cc: Rusty Russell Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-sparse@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 16 +++++- arch/x86/include/asm/asm.h | 7 +++ arch/x86/kernel/paravirt.c | 4 ++ include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/compiler.h | 2 + include/linux/kprobes.h | 20 ++++++-- kernel/kprobes.c | 100 ++++++++++++++++++++------------------ kernel/sched/core.c | 1 + 8 files changed, 107 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 0cfb00fd86ff..4bbeca8483ed 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -22,8 +22,9 @@ Appendix B: The kprobes sysctl interface Kprobes enables you to dynamically break into any kernel routine and collect debugging and performance information non-disruptively. You -can trap at almost any kernel code address, specifying a handler +can trap at almost any kernel code address(*), specifying a handler routine to be invoked when the breakpoint is hit. +(*: some parts of the kernel code can not be trapped, see 1.5 Blacklist) There are currently three types of probes: kprobes, jprobes, and kretprobes (also called return probes). A kprobe can be inserted @@ -273,6 +274,19 @@ using one of the following techniques: or - Execute 'sysctl -w debug.kprobes_optimization=n' +1.5 Blacklist + +Kprobes can probe most of the kernel except itself. This means +that there are some functions where kprobes cannot probe. Probing +(trapping) such functions can cause a recursive trap (e.g. double +fault) or the nested probe handler may never be called. +Kprobes manages such functions as a blacklist. +If you want to add a function into the blacklist, you just need +to (1) include linux/kprobes.h and (2) use NOKPROBE_SYMBOL() macro +to specify a blacklisted function. +Kprobes checks the given probe address against the blacklist and +rejects registering it, if the given address is in the blacklist. + 2. Architectures Supported Kprobes, jprobes, and return probes are implemented on the following diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 4582e8e1cd1a..7730c1c5c83a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -57,6 +57,12 @@ .long (from) - . ; \ .long (to) - . + 0x7ffffff0 ; \ .popsection + +# define _ASM_NOKPROBE(entry) \ + .pushsection "_kprobe_blacklist","aw" ; \ + _ASM_ALIGN ; \ + _ASM_PTR (entry); \ + .popsection #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -71,6 +77,7 @@ " .long (" #from ") - .\n" \ " .long (" #to ") - . + 0x7ffffff0\n" \ " .popsection\n" +/* For C file, we already have NOKPROBE_SYMBOL macro */ #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b10af835c31..e136869ae42e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -389,6 +390,9 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .end_context_switch = paravirt_nop, }; +/* At this point, native_get_debugreg has a real function entry */ +NOKPROBE_SYMBOL(native_get_debugreg); + struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 146e4fffd710..40ceb3ceba79 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -109,6 +109,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_KPROBES +#define KPROBE_BLACKLIST() VMLINUX_SYMBOL(__start_kprobe_blacklist) = .; \ + *(_kprobe_blacklist) \ + VMLINUX_SYMBOL(__stop_kprobe_blacklist) = .; +#else +#define KPROBE_BLACKLIST() +#endif + #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ @@ -507,6 +515,7 @@ *(.init.rodata) \ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ + KPROBE_BLACKLIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ee7239ea1583..0300c0f5c88b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -374,7 +374,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ #ifdef CONFIG_KPROBES # define __kprobes __attribute__((__section__(".kprobes.text"))) +# define nokprobe_inline __always_inline #else # define __kprobes +# define nokprobe_inline inline #endif #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index cdf9251f8249..e059507c465d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -205,10 +205,10 @@ struct kretprobe_blackpoint { void *addr; }; -struct kprobe_blackpoint { - const char *name; +struct kprobe_blacklist_entry { + struct list_head list; unsigned long start_addr; - unsigned long range; + unsigned long end_addr; }; #ifdef CONFIG_KPROBES @@ -477,4 +477,18 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } +#ifdef CONFIG_KPROBES +/* + * Blacklist ganerating macro. Specify functions which is not probed + * by using this macro. + */ +#define __NOKPROBE_SYMBOL(fname) \ +static unsigned long __used \ + __attribute__((section("_kprobe_blacklist"))) \ + _kbl_addr_##fname = (unsigned long)fname; +#define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) +#else +#define NOKPROBE_SYMBOL(fname) +#endif + #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5b5ac76671e7..5ffc6875d2a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -86,18 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {NULL} /* Terminator */ -}; +/* Blacklist -- list of struct kprobe_blacklist_entry */ +static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* @@ -1328,24 +1318,22 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static int __kprobes in_kprobes_functions(unsigned long addr) +static bool __kprobes within_kprobe_blacklist(unsigned long addr) { - struct kprobe_blackpoint *kb; + struct kprobe_blacklist_entry *ent; if (arch_within_kprobe_blacklist(addr)) - return -EINVAL; + return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } + list_for_each_entry(ent, &kprobe_blacklist, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; } - return 0; + + return false; } /* @@ -1436,7 +1424,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -2022,6 +2010,38 @@ void __kprobes dump_kprobe(struct kprobe *kp) kp->symbol_name, kp->addr, kp->offset); } +/* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ +static int __init populate_kprobe_blacklist(unsigned long *start, + unsigned long *end) +{ + unsigned long *iter; + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + for (iter = start; iter < end; iter++) { + if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { + pr_err("Failed to find blacklist %p\n", (void *)*iter); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = *iter; + ent->end_addr = *iter + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + } + return 0; +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2065,14 +2085,13 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +/* Markers of _kprobe_blacklist section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + static int __init init_kprobes(void) { int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -2082,26 +2101,11 @@ static int __init init_kprobes(void) raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; + err = populate_kprobe_blacklist(__start_kprobe_blacklist, + __stop_kprobe_blacklist); + if (err) { + pr_err("kprobes: failed to populate blacklist: %d\n", err); + pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..6863631e8cd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2804,6 +2804,7 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ -- cgit v1.2.3-71-gd317 From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer(). Test-case: #include #include #include extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to cleanup them first. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/traps.c | 7 ++++--- include/linux/uprobes.h | 4 ++++ kernel/events/uprobes.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 73b3ea32245a..3fdb20548c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -148,11 +149,11 @@ static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, case X86_TRAP_DE: sicode = FPE_INTDIV; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_UD: sicode = ILL_ILLOPN; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_AC: sicode = BUS_ADRALN; @@ -531,7 +532,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..88c3b7e8b384 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r #else /* !CONFIG_UPROBES */ struct uprobes_state { }; + +#define uprobe_get_trap_addr(regs) instruction_pointer(regs) + static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e8bfa4..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. -- cgit v1.2.3-71-gd317 From 53b25335dd60981ad608da7890420898a34469a6 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 16 May 2014 17:12:12 -0400 Subject: perf: Disable sampled events if no PMU interrupt Add common code to generate -ENOTSUPP at event creation time if an architecture attempts to create a sampled event and PERF_PMU_NO_INTERRUPT is set. This adds a new pmu->capabilities flag. Initially we only support PERF_PMU_NO_INTERRUPT (to indicate a PMU has no support for generating hardware interrupts) but there are other capabilities that can be added later. Signed-off-by: Vince Weaver Acked-by: Will Deacon [peterz: rename to PERF_PMU_CAP_* and moved the pmu::capabilities word into a hole] Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1405161708060.11099@vincent-weaver-1.umelst.maine.edu Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af6dcf1d9e47..267c8f37012c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -166,6 +166,11 @@ struct perf_event; */ #define PERF_EVENT_TXN 0x1 +/** + * pmu::capabilities flags + */ +#define PERF_PMU_CAP_NO_INTERRUPT 0x01 + /** * struct pmu - generic performance monitoring unit */ @@ -178,6 +183,11 @@ struct pmu { const char *name; int type; + /* + * various common per-pmu feature flags + */ + int capabilities; + int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; diff --git a/kernel/events/core.c b/kernel/events/core.c index a62d142ad498..e9ef0c6646af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7120,6 +7120,13 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (is_sampling_event(event)) { + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { + err = -ENOTSUPP; + goto err_alloc; + } + } + account_event(event); /* -- cgit v1.2.3-71-gd317 From e041e328c4b41e1db79bfe5ba9992c2ed771ad19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 May 2014 17:32:19 +0200 Subject: perf: Fix perf_event_comm() vs. exec() assumption perf_event_comm() assumes that set_task_comm() is only called on exec(), and in particular that its only called on current. Neither are true, as Dave reported a WARN triggered by set_task_comm() being called on !current. Separate the exec() hook from the comm hook. Reported-by: Dave Jones Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140521153219.GH5226@laptop.programming.kicks-ass.net [ Build fix. ] Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 4 +++- kernel/events/core.c | 28 ++++++++++++++++------------ 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 238b7aa26f68..a038a41a3677 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1110,6 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + perf_event_exec(); set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. We have to do that late because it may diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ef6ea12806a..9b5cd1992a88 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -695,6 +695,7 @@ extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -772,7 +773,7 @@ extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); -#else +#else /* !CONFIG_PERF_EVENTS: */ static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } @@ -802,6 +803,7 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index 440eefc67397..647698f91988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2970,6 +2970,22 @@ out: local_irq_restore(flags); } +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + rcu_read_unlock(); +} + /* * Cross CPU call to read the hardware event */ @@ -5057,18 +5073,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; -- cgit v1.2.3-71-gd317 From 82b897782d10fcc4930c9d4a15b175348fdd2871 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 28 May 2014 11:45:04 +0300 Subject: perf: Differentiate exec() and non-exec() comm events perf tools like 'perf report' can aggregate samples by comm strings, which generally works. However, there are other potential use-cases. For example, to pair up 'calls' with 'returns' accurately (from branch events like Intel BTS) it is necessary to identify whether the process has exec'd. Although a comm event is generated when an 'exec' happens it is also generated whenever the comm string is changed on a whim (e.g. by prctl PR_SET_NAME). This patch adds a flag to the comm event to differentiate one case from the other. In order to determine whether the kernel supports the new flag, a selection bit named 'exec' is added to struct perf_event_attr. The bit does nothing but will cause perf_event_open() to fail if the bit is set on kernels that do not have it defined. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/537D9EBE.7030806@intel.com Cc: Paul Mackerras Cc: Dave Jones Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Alexander Viro Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 6 +++--- include/linux/perf_event.h | 4 ++-- include/linux/sched.h | 6 +++++- include/uapi/linux/perf_event.h | 9 +++++++-- kernel/events/core.c | 4 ++-- 5 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index a038a41a3677..a3d33fe592d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, const char *buf) +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_event_comm(tsk); + perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); perf_event_exec(); - set_task_comm(current, kbasename(bprm->filename)); + __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4c1d4685bf0..707617a8c0f6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -707,7 +707,7 @@ extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks * extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_exec(void); -extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ @@ -815,7 +815,7 @@ static inline int perf_unregister_guest_info_callbacks static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_exec(void) { } -static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 221b2bde3723..ad86e1d7dbc2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2379,7 +2379,11 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, const char *from); +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +static inline void set_task_comm(struct task_struct *tsk, const char *from) +{ + __set_task_comm(tsk, from, false); +} extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d9cd853818ad..5312fae47218 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -302,8 +302,8 @@ struct perf_event_attr { exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ - - __reserved_1 : 40; + comm_exec : 1, /* flag comm events that are due to an exec */ + __reserved_1 : 39; union { __u32 wakeup_events; /* wakeup every n events */ @@ -502,7 +502,12 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +/* + * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on + * different events so can reuse the same bit position. + */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fac2056d51e..7da5e561e89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) NULL); } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; @@ -5104,7 +5104,7 @@ void perf_event_comm(struct task_struct *task) .event_id = { .header = { .type = PERF_RECORD_COMM, - .misc = 0, + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ -- cgit v1.2.3-71-gd317