From 6909e29fdefbb7aa643021279daef6ed10c81528 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Oct 2017 16:06:11 +0200 Subject: kdb: use __ktime_get_real_seconds instead of __current_kernel_time kdb is the only user of the __current_kernel_time() interface, which is not y2038 safe and should be removed at some point. The kdb code also goes to great lengths to print the time in a human-readable format from 'struct timespec', again using a non-y2038-safe re-implementation of the generic time_to_tm() code. Using __current_kernel_time() here is necessary since the regular accessors that require a sequence lock might hang when called during the xtime update. However, this is safe in the particular case since kdb is only interested in the tv_sec field that is updated atomically. In order to make this y2038-safe, I'm converting the code to the generic time64_to_tm helper, but that introduces the problem that we have no interface like __current_kernel_time() that provides a 64-bit timestamp in a lockless, safe and architecture-independent way. I have multiple ideas for how to solve that:

 - __ktime_get_real_seconds() is lockless, but can return incorrect results on 32-bit architectures in the special case that we are in the process of changing the time across the epoch, either during the timer tick that overflows the seconds in 2038, or while calling settimeofday.

 - ktime_get_real_fast_ns() would work in this context, but does require a call into the clocksource driver to return a high-resolution timestamp. This may have undesired side-effects in the debugger, since we want to limit the interactions with the rest of the kernel.

 - Adding a ktime_get_real_fast_seconds() based on tk_fast_mono plus tkr->base_real without the tk_clock_read() delta. Not sure about the value of adding yet another interface here.

 - Changing the existing ktime_get_real_seconds() to use tk_fast_mono on 32-bit architectures rather than xtime_sec. I think this could work, but am not entirely sure if this is an improvement.

I picked the first of those for simplicity here. It's technically not correct but probably good enough as the time is only used for the debugging output and the race will likely never be hit in practice. Another downside is having to move the declaration into a public header file. Let me know if anyone has a different preference.
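For illustration, the converted printing path boils down to the following minimal sketch (not part of the patch; example_print_date() is a hypothetical helper, while time64_to_tm() and kdb_printf() are the interfaces the diff below actually uses):

#include <linux/kdb.h>
#include <linux/time.h>

/* Print a 64-bit UTC timestamp the way kdb_summary() does after this patch. */
static void example_print_date(time64_t now)
{
        struct tm tm;

        time64_to_tm(now, 0, &tm);      /* offset 0 == UTC; y2038-safe */
        kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d\n",
                   1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday,
                   tm.tm_hour, tm.tm_min, tm.tm_sec);
}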
Cc: Andy Shevchenko Link: https://patchwork.kernel.org/patch/9775309/ Signed-off-by: Arnd Bergmann Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 45 +++++--------------------------------- kernel/time/timekeeping_internal.h | 2 -- 2 files changed, 5 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index c8146d53ca67..69e70f4021fe 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2479,41 +2479,6 @@ static int kdb_kill(int argc, const char **argv) return 0; } -struct kdb_tm { - int tm_sec; /* seconds */ - int tm_min; /* minutes */ - int tm_hour; /* hours */ - int tm_mday; /* day of the month */ - int tm_mon; /* month */ - int tm_year; /* year */ -}; - -static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) -{ - /* This will work from 1970-2099, 2100 is not a leap year */ - static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, - 31, 30, 31, 30, 31 }; - memset(tm, 0, sizeof(*tm)); - tm->tm_sec = tv->tv_sec % (24 * 60 * 60); - tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + - (2 * 365 + 1); /* shift base from 1970 to 1968 */ - tm->tm_min = tm->tm_sec / 60 % 60; - tm->tm_hour = tm->tm_sec / 60 / 60; - tm->tm_sec = tm->tm_sec % 60; - tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); - tm->tm_mday %= (4*365+1); - mon_day[1] = 29; - while (tm->tm_mday >= mon_day[tm->tm_mon]) { - tm->tm_mday -= mon_day[tm->tm_mon]; - if (++tm->tm_mon == 12) { - tm->tm_mon = 0; - ++tm->tm_year; - mon_day[1] = 28; - } - } - ++tm->tm_mday; -} - /* * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). * I cannot call that code directly from kdb, it has an unconditional @@ -2539,8 +2504,8 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { - struct timespec now; - struct kdb_tm tm; + time64_t now; + struct tm tm; struct sysinfo val; if (argc) @@ -2554,9 +2519,9 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - now = __current_kernel_time(); - kdb_gmtime(&now, &tm); - kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + now = __ktime_get_real_seconds(); + time64_to_tm(now, 0, &tm); + kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index fdbeeb02dde9..cf5c0828ee31 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif -extern time64_t __ktime_get_real_seconds(void); - #endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3-71-gd317 From 1e0ce03bf142454f38a5fc050bf4fd698d2d36d8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Dec 2017 10:19:19 -0800 Subject: kdb: make "mdr" command repeat The "mdr" command should repeat (continue) when only Enter/Return is pressed, so make it do so. 
Signed-off-by: Randy Dunlap Cc: Daniel Thompson Cc: Jason Wessel Cc: kgdb-bugreport@lists.sourceforge.net Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 69e70f4021fe..ff6047d3b73f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1566,6 +1566,7 @@ static int kdb_md(int argc, const char **argv) int symbolic = 0; int valid = 0; int phys = 0; + int raw = 0; kdbgetintenv("MDCOUNT", &mdcount); kdbgetintenv("RADIX", &radix); @@ -1575,9 +1576,10 @@ static int kdb_md(int argc, const char **argv) repeat = mdcount * 16 / bytesperword; if (strcmp(argv[0], "mdr") == 0) { - if (argc != 2) + if (argc == 2 || (argc == 0 && last_addr != 0)) + valid = raw = 1; + else return KDB_ARGCOUNT; - valid = 1; } else if (isdigit(argv[0][2])) { bytesperword = (int)(argv[0][2] - '0'); if (bytesperword == 0) { @@ -1613,7 +1615,10 @@ static int kdb_md(int argc, const char **argv) radix = last_radix; bytesperword = last_bytesperword; repeat = last_repeat; - mdcount = ((repeat * bytesperword) + 15) / 16; + if (raw) + mdcount = repeat; + else + mdcount = ((repeat * bytesperword) + 15) / 16; } if (argc) { @@ -1630,7 +1635,10 @@ static int kdb_md(int argc, const char **argv) diag = kdbgetularg(argv[nextarg], &val); if (!diag) { mdcount = (int) val; - repeat = mdcount * 16 / bytesperword; + if (raw) + repeat = mdcount; + else + repeat = mdcount * 16 / bytesperword; } } if (argc >= nextarg+1) { @@ -1640,8 +1648,15 @@ static int kdb_md(int argc, const char **argv) } } - if (strcmp(argv[0], "mdr") == 0) - return kdb_mdr(addr, mdcount); + if (strcmp(argv[0], "mdr") == 0) { + int ret; + last_addr = addr; + ret = kdb_mdr(addr, mdcount); + last_addr += mdcount; + last_repeat = mdcount; + last_bytesperword = bytesperword; // to make REPEAT happy + return ret; + } switch (radix) { case 10: -- cgit v1.2.3-71-gd317 From b0f73bc7f1793997eb48bd14e3db51c3c95e2098 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Dec 2017 10:19:23 -0800 Subject: kdb: drop newline in unknown command output When an unknown command is entered, kdb prints "Unknown kdb command:" and then the unknown text, including the newline character. This causes the ending single-quote mark to be printed on the next line by itself, so just change the ending newline character to a null character (end of string) so that it won't be "printed." Signed-off-by: Randy Dunlap Cc: Daniel Thompson Cc: Jason Wessel Cc: kgdb-bugreport@lists.sourceforge.net Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ff6047d3b73f..6055231544a0 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p) kdb_current_regs = NULL; } +static void drop_newline(char *buf) +{ + size_t len = strlen(buf); + + if (len == 0) + return; + if (*(buf + len - 1) == '\n') + *(buf + len - 1) = '\0'; +} + /* * kdb_local - The main code for kdb. This routine is invoked on a * specific processor, it is not global. 
The main kdb() routine @@ -1327,6 +1337,7 @@ do_full_getstr: cmdptr = cmd_head; diag = kdb_parse(cmdbuf); if (diag == KDB_NOTFOUND) { + drop_newline(cmdbuf); kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); diag = 0; } -- cgit v1.2.3-71-gd317 From 33f765f698895527acd69faf5d54ab07f02683ff Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Dec 2017 10:19:28 -0800 Subject: kdb: bl: don't use tab character in output The "bl" (list breakpoints) command prints a '\t' (tab) character in its output, but on a console (video device), that just prints some odd graphics character. Instead of printing a tab character, just align the output with spaces. Signed-off-by: Randy Dunlap Cc: Daniel Thompson Cc: Jason Wessel Cc: kgdb-bugreport@lists.sourceforge.net Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_bp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 90ff129c88a2..62c301ad0773 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i) kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); if (bp->bp_enabled) - kdb_printf("\n is enabled"); + kdb_printf("\n is enabled "); else kdb_printf("\n is disabled"); - kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n", bp->bp_addr, bp->bp_type, bp->bp_installed); kdb_printf("\n"); -- cgit v1.2.3-71-gd317 From 40b90efeae9be8702d387dbcbb3aadc57033d4db Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 29 Jan 2018 10:22:51 +0800 Subject: kdb: use ktime_get_mono_fast_ns() instead of ktime_get_ts() The kdb code prints the monotonic time via ktime_get_ts(), but ktime_get_ts() is protected by a sequence lock, which introduces a deadlock risk if the lock was already held in the context from which we entered the debugger. Thus we can use ktime_get_mono_fast_ns() to get the monotonic time, which provides NMI-safe access to the monotonic clock. Moreover, we can remove the 'struct timespec', which is not y2038 safe.
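The lockless pattern the patch switches to can be sketched as follows (hypothetical helper, assuming only <linux/timekeeping.h>, <linux/math64.h> and <linux/time64.h>; the actual change is in the diff below):

#include <linux/timekeeping.h>
#include <linux/math64.h>
#include <linux/time64.h>

/* NMI-safe uptime in seconds: no seqlock, so safe from debugger context. */
static u64 example_uptime_seconds(void)
{
        return div_u64(ktime_get_mono_fast_ns(), NSEC_PER_SEC);
}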
Signed-off-by: Baolin Wang Reviewed-by: Daniel Thompson Reviewed-by: Arnd Bergmann Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6055231544a0..16140d1aa0c3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2512,10 +2512,10 @@ static int kdb_kill(int argc, const char **argv) */ static void kdb_sysinfo(struct sysinfo *val) { - struct timespec uptime; - ktime_get_ts(&uptime); + u64 uptime = ktime_get_mono_fast_ns(); + memset(val, 0, sizeof(*val)); - val->uptime = uptime.tv_sec; + val->uptime = div_u64(uptime, NSEC_PER_SEC); val->loads[0] = avenrun[0]; val->loads[1] = avenrun[1]; val->loads[2] = avenrun[2]; -- cgit v1.2.3-71-gd317 From 2cf2f0d5b91fd1b06a6ae260462fc7945ea84add Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Feb 2018 15:59:40 +0100 Subject: kdb: use memmove instead of overlapping memcpy gcc discovered that the memcpy() arguments in kdbnearsym() overlap, so we should really use memmove(), which is defined to handle that correctly: In function 'memcpy', inlined from 'kdbnearsym' at /git/arm-soc/kernel/debug/kdb/kdb_support.c:132:4: /git/arm-soc/include/linux/string.h:353:9: error: '__builtin_memcpy' accessing 792 bytes at offsets 0 and 8 overlaps 784 bytes at offset 8 [-Werror=restrict] return __builtin_memcpy(p, q, size); Signed-off-by: Arnd Bergmann Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_support.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index d35cc2d3a4cc..990b3cc526c8 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) } if (i >= ARRAY_SIZE(kdb_name_table)) { debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, + memmove(kdb_name_table, kdb_name_table+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-1)); } else { debug_kfree(knt1); knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, + memmove(kdb_name_table+i, kdb_name_table+i+1, sizeof(kdb_name_table[0]) * (ARRAY_SIZE(kdb_name_table)-i-1)); } -- cgit v1.2.3-71-gd317 From e12f03d7031a977356e3d7b75a68c2185ff8d155 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:15 -0800 Subject: perf/core: Implement the 'perf_kprobe' PMU A new PMU type, perf_kprobe, is added. Based on attr from perf_event_open(), perf_kprobe creates a kprobe (or kretprobe) for the perf_event. This kprobe is private to this perf_event, and thus not added to global lists, and not available in tracefs. Two functions, create_local_trace_kprobe() and destroy_local_trace_kprobe(), are added to create and destroy these local trace_kprobes.
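To show how the new PMU is meant to be consumed, here is a hypothetical user-space sketch (not part of the patch): it assumes a uapi linux/perf_event.h new enough to carry the kprobe_func/probe_offset fields this series adds to perf_event_attr, and attaches a kprobe to do_sys_open():

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Dynamic type id that perf_pmu_register(&perf_kprobe, "kprobe", -1) got. */
static int kprobe_pmu_type(void)
{
        int type = -1;
        FILE *f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");

        if (f) {
                if (fscanf(f, "%d", &type) != 1)
                        type = -1;
                fclose(f);
        }
        return type;
}

int main(void)
{
        struct perf_event_attr attr;
        int type = kprobe_pmu_type();
        int fd;

        if (type < 0) {
                fprintf(stderr, "kprobe PMU not available\n");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        /* union fields added by this series; the string is read by the kernel */
        attr.kprobe_func = (__u64)(unsigned long)"do_sys_open";
        attr.probe_offset = 0;
        attr.config = 0;        /* bit 0 set would request a kretprobe */

        /* pid == -1, cpu == 0: all tasks on CPU 0; needs privilege */
        fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        close(fd);
        return 0;
}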
Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-6-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 ++ kernel/events/core.c | 142 ++++++++++++++++++++++++++++++---------- kernel/trace/trace_event_perf.c | 49 ++++++++++++++ kernel/trace/trace_kprobe.c | 91 ++++++++++++++++++++++--- kernel/trace/trace_probe.h | 7 ++ 5 files changed, 250 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index af44e7c2d577..21c5d43a21af 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -533,6 +533,10 @@ extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); +#ifdef CONFIG_KPROBE_EVENTS +extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_kprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); diff --git a/kernel/events/core.c b/kernel/events/core.c index d99fe3fdec8a..333735531968 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7992,9 +7992,77 @@ static struct pmu perf_tracepoint = { .read = perf_swevent_read, }; +#ifdef CONFIG_KPROBE_EVENTS +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match following PMU_FORMAT_ATTR().
+ * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ +}; + +PMU_FORMAT_ATTR(retprobe, "config:0"); + +static struct attribute *probe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group probe_format_group = { + .name = "format", + .attrs = probe_attrs, +}; + +static const struct attribute_group *probe_attr_groups[] = { + &probe_format_group, + NULL, +}; + +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_kprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_kprobe_destroy; + + return 0; +} +#endif /* CONFIG_KPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8071,13 +8139,28 @@ static void perf_event_free_bpf_handler(struct perf_event *event) } #endif +/* + * returns true if the event is a tracepoint, or a kprobe/uprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) +{ + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif + return false; +} + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; @@ -8116,7 +8199,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static void perf_event_free_bpf_prog(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) { + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } @@ -8535,47 +8618,36 @@ fail_clear_files: return ret; } -static int -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) -{ - struct perf_event_context *ctx = event->ctx; - int ret; - - /* - * Beware, here be dragons!! - * - * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint - * stuff does not actually need it. So temporarily drop ctx->mutex. As per - * perf_event_ctx_lock() we already have a reference on ctx. - * - * This can result in event getting moved to a different ctx, but that - * does not affect the tracepoint state.
- */ - mutex_unlock(&ctx->mutex); - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - mutex_lock(&ctx->mutex); - - return ret; -} - static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == PERF_TYPE_TRACEPOINT) - ret = perf_tracepoint_set_filter(event, filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff37daf..779baade9114 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include #include #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; @@ -237,6 +238,54 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_unlock(&event_mutex); } +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + return -ENOMEM; + ret = strncpy_from_user( + func, u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (ret < 0) + goto out; + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..246c786c851c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -438,6 +438,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) disable_kprobe(&tk->rp.kp); wait = 1; } + + /* + * if tk is not added to any list, it must be a local trace_kprobe + * created with perf_event_open. 
We don't need to wait for these + * trace_kprobes + */ + if (list_empty(&tk->list)) + wait = 0; out: if (wait) { /* @@ -1313,12 +1321,9 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_kprobe_event(struct trace_kprobe *tk) +static inline void init_trace_event_call(struct trace_kprobe *tk, + struct trace_event_call *call) { - struct trace_event_call *call = &tk->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1327,6 +1332,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct trace_event_call *call = &tk->tp.call; + int ret = 0; + + init_trace_event_call(tk, call); + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1334,9 +1352,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = TRACE_EVENT_FL_KPROBE; - call->class->reg = kprobe_register; - call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", @@ -1358,6 +1373,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_PERF_EVENTS +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + struct trace_kprobe *tk; + int ret; + char *event; + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? 
func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk, &tk->tp.call); + + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return &tk->tp.call; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = container_of(event_call, struct trace_kprobe, tp.call); + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..e3d940a49dcd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,3 +404,10 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, } extern int set_print_fmt(struct trace_probe *tp, bool is_return); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); +#endif -- cgit v1.2.3-71-gd317 From 33ea4b24277b06dbc55d7f5772a46f029600255e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:16 -0800 Subject: perf/core: Implement the 'perf_uprobe' PMU This patch adds perf_uprobe support with similar pattern as previous patch (for kprobe). Two functions, create_local_trace_uprobe() and destroy_local_trace_uprobe(), are created so a uprobe can be created and attached to the file descriptor created by perf_event_open(). 
Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-7-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 ++ kernel/events/core.c | 48 ++++++++++++++++++++++- kernel/trace/trace_event_perf.c | 53 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 4 ++ kernel/trace/trace_uprobe.c | 86 +++++++++++++++++++++++++++++++++++++---- 5 files changed, 186 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21c5d43a21af..0d9d6cb454b1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -537,6 +537,10 @@ extern void perf_trace_del(struct perf_event *event, int flags); extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); #endif +#ifdef CONFIG_UPROBE_EVENTS +extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_uprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); diff --git a/kernel/events/core.c b/kernel/events/core.c index 333735531968..5a54630db86b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7992,7 +7992,7 @@ static struct pmu perf_tracepoint = { .read = perf_swevent_read, }; -#ifdef CONFIG_KPROBE_EVENTS +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR().
@@ -8020,7 +8020,9 @@ static const struct attribute_group *probe_attr_groups[] = { &probe_format_group, NULL, }; +#endif +#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8057,12 +8059,52 @@ static int perf_kprobe_event_init(struct perf_event *event) } #endif /* CONFIG_KPROBE_EVENTS */ +#ifdef CONFIG_UPROBE_EVENTS +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_uprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_uprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_uprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_uprobe_destroy; + + return 0; +} +#endif /* CONFIG_UPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8150,6 +8192,10 @@ static inline bool perf_event_is_tracing(struct perf_event *event) #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; #endif return false; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 779baade9114..2c416509b834 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -286,6 +286,59 @@ void perf_kprobe_destroy(struct perf_event *p_event) } #endif /* CONFIG_KPROBE_EVENTS */ +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + ret = strncpy_from_user( + path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret < 0) + goto out; + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe( + path, p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. 
+ */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e3d940a49dcd..27de3174050a 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -410,4 +410,8 @@ extern struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return); extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..e3cbae702a53 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static int register_uprobe_event(struct trace_uprobe *tu) +static inline void init_trace_event_call(struct trace_uprobe *tu, + struct trace_event_call *call) { - struct trace_event_call *call = &tu->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; + call->flags = TRACE_EVENT_FL_UPROBE; + call->class->reg = trace_uprobe_register; + call->data = tu; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct trace_event_call *call = &tu->tp.call; + int ret = 0; + + init_trace_event_call(tu, call); + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; @@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - call->flags = TRACE_EVENT_FL_UPROBE; - call->class->reg = trace_uprobe_register; - call->data = tu; ret = trace_add_event_call(call); if (ret) { @@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) return 0; } +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +{ + struct trace_uprobe *tu; + struct inode *inode; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + iput(inode); + return ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. 
+ */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->inode = inode; + tu->filename = kstrdup(name, GFP_KERNEL); + init_trace_event_call(tu, &tu->tp.call); + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ret = -ENOMEM; + goto error; + } + + return &tu->tp.call; +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = container_of(event_call, struct trace_uprobe, tp.call); + + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { -- cgit v1.2.3-71-gd317 From 097114aa6eb2aa206c8cf136de77ebffe424234c Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sat, 27 Jan 2018 12:11:29 +0800 Subject: print kdump kernel loaded status in stack dump It is useful to print the kdump kernel's loaded status in dump_stack(), especially when a panic happens, so that we can differentiate an early hang of the kdump kernel from a normal panic in a bug report. Link: http://lkml.kernel.org/r/20180127041129.GA29016@dhcp-128-65.nay.redhat.com To: Steven Rostedt To: Andi Kleen To: kexec@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: akpm@linux-foundation.org Cc: kexec@lists.infradead.org Signed-off-by: Dave Young Reviewed-by: Sergey Senozhatsky Reviewed-by: Simon Horman Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db4b9b8929eb..fa3de5f10e0e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include #include #include +#include <linux/kexec.h> #include #include @@ -3287,9 +3288,11 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...) */ void dump_stack_print_info(const char *log_lvl) { - printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n", log_lvl, raw_smp_processor_id(), current->pid, current->comm, - print_tainted(), init_utsname()->release, + kexec_crash_loaded() ? "Kdump: loaded " : "", + print_tainted(), + init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); -- cgit v1.2.3-71-gd317 From c53593e5cb693d59d9e8b64fb3a79436bf99c3b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2018 11:26:18 -0800 Subject: sched, cgroup: Don't reject lower cpu.max on ancestors While adding the cgroup2 interface for the cpu controller, 0d5936344f30 ("sched: Implement interface for cgroup unified hierarchy") forgot to update input validation and left it to reject cpu.max config if any descendant has set a higher value. cgroup2 officially supports delegation and a descendant must not be able to restrict what its ancestors can configure. For absolute limits such as cpu.max and memory.max, this means that the config at each level should only act as the upper limit at that level and shouldn't interfere with what other cgroups can configure. This patch updates config validation on cgroup2 so that the cpu controller follows the same convention.
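As a worked example of the behaviour change, consider a parent with a 50 ms quota per period and a child asking for 80 ms; the toy model below (hypothetical stand-alone code, not the kernel function) mirrors the validation in the diff that follows:

#include <stdio.h>

#define RUNTIME_INF (~0UL)      /* "max": no limit configured */

/* Toy model of the tg_cfs_schedulable_down() quota check. */
static int child_quota(int on_cgroup2, unsigned long quota,
                       unsigned long parent, unsigned long *effective)
{
        if (on_cgroup2) {
                /* New behaviour: clamp to the ancestor, never reject. */
                *effective = quota < parent ? quota : parent;
                return 0;
        }
        /* cgroup1: inherit when unset, reject a child above its parent. */
        if (quota == RUNTIME_INF)
                quota = parent;
        else if (parent != RUNTIME_INF && quota > parent)
                return -22;     /* -EINVAL */
        *effective = quota;
        return 0;
}

int main(void)
{
        unsigned long eff;

        if (child_quota(1, 80000, 50000, &eff) == 0)
                printf("cgroup2: accepted, effective %lu usec\n", eff); /* 50000 */
        if (child_quota(0, 80000, 50000, &eff) != 0)
                printf("cgroup1: write rejected (-EINVAL)\n");
        return 0;
}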
Signed-off-by: Tejun Heo Fixes: 0d5936344f30 ("sched: Implement interface for cgroup unified hierarchy") Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # v4.15+ --- kernel/sched/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf724c1952ea..1bc6a694c84f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,13 +6678,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * Ensure max(child_quota) <= parent_quota, inherit when no + * Ensure max(child_quota) <= parent_quota. On cgroup2, + * always take the min. On cgroup1, only inherit when no * limit is set: */ - if (quota == RUNTIME_INF) - quota = parent_quota; - else if (parent_quota != RUNTIME_INF && quota > parent_quota) - return -EINVAL; + if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { + quota = min(quota, parent_quota); + } else { + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } } cfs_b->hierarchical_quota = quota; -- cgit v1.2.3-71-gd317 From 387f77cc8249c847b4fa4d8c93694818b79efee3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 Feb 2018 09:59:42 +0100 Subject: sched/fair: Remove stray space in #ifdef Remove a useless space in # ifdef and align it with others. Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518512382-29426-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc9be84..820f94c9b200 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2823,7 +2823,7 @@ void reweight_task(struct task_struct *p, int prio) } #ifdef CONFIG_FAIR_GROUP_SCHED -# ifdef CONFIG_SMP +#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. @@ -2974,7 +2974,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) return clamp_t(long, runnable, MIN_SHARES, shares); } -# endif /* CONFIG_SMP */ +#endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -- cgit v1.2.3-71-gd317 From 906f63ec1d1b4941d36eee18121e68749c9e3279 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:24 +0300 Subject: net: Convert audit_net_ops This patch starts to convert pernet_subsys, registered from postcore initcalls. audit_net_init() creates netlink socket, while audit_net_exit() destroys it. The rest of the pernet_list are not interested in the socket, so we make audit_net_ops async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..5e49b614d0e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1526,6 +1526,7 @@ static struct pernet_operations audit_net_ops __net_initdata = { .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), + .async = true, }; /* Initialize audit support at boot time. 
*/ -- cgit v1.2.3-71-gd317 From d590dca62bb8a315a2356159444de07278a3ad83 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 3 Feb 2018 00:33:11 -0500 Subject: audit: update bugtracker and source URIs Since the Linux Audit project has transitioned completely over to github, update the MAINTAINERS file and the primary audit source file to reflect that reality. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- MAINTAINERS | 1 - kernel/audit.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index 845fc25812f1..fba48756e495 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2479,7 +2479,6 @@ M: Paul Moore M: Eric Paris L: linux-audit@redhat.com (moderated for non-subscribers) W: https://github.com/linux-audit -W: https://people.redhat.com/sgrubb/audit T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git S: Supported F: include/linux/audit.h diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..5c2544984375 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -38,7 +38,8 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * - * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ + * Audit userspace, documentation, tests, and bug/issue trackers: + * https://github.com/linux-audit */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit v1.2.3-71-gd317 From 6387440e15db1c9ee58028433cd87291cae488e7 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 12 Feb 2018 05:04:53 -0500 Subject: audit: session ID should not set arch quick field pointer A bug was introduced in 8fae47705685fcaa75a1fe4c8c3e18300a702979 ("audit: add support for session ID user filter") See: https://github.com/linux-audit/audit-kernel/issues/4 When setting a session ID filter, the session ID filter field overwrote the quick pointer reference to the arch field, potentially causing the arch field to be misinterpreted. Passes audit-testsuite. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4a1758adb222..739a6d2d4df8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -496,7 +496,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; -- cgit v1.2.3-71-gd317 From 544bdebc6fcb68c2e8075bc2d3b68e39789d4165 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 14 Feb 2018 13:50:37 -0800 Subject: bpf: Remove unused callee_saved array This array appears to be completely unused, remove it. 
Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fb69a85d967..3c74b163eaeb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -508,10 +508,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct bpf_reg_state *reg); -- cgit v1.2.3-71-gd317 From 5260ecc2e0480cc7e184901ab4c3721d0c2765e3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 14 Feb 2018 21:47:43 -0500 Subject: audit: deprecate the AUDIT_FILTER_ENTRY filter The audit entry filter has been long deprecated with userspace support finally removed in audit-v2.6.7 and plans to remove kernel support have existed since kernel-v2.6.31. Remove it. Since removing the audit entry filter, test for early return before setting up any context state. Passes audit-testsuite. See: https://github.com/linux-audit/audit-kernel/issues/6 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 4 ++-- kernel/auditsc.c | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 739a6d2d4df8..d7a807e81451 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -258,8 +258,8 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * goto exit_err; #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; + pr_err("AUDIT_FILTER_ENTRY is deprecated\n"); + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e80459f7e132..bc534bfb49a4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1519,22 +1519,23 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, if (!audit_enabled) return; - context->arch = syscall_get_arch(); - context->major = major; - context->argv[0] = a1; - context->argv[1] = a2; - context->argv[2] = a3; - context->argv[3] = a4; - state = context->state; + if (state == AUDIT_DISABLED) + return; + context->dummy = !audit_n_rules; if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { context->prio = 0; - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + if (auditd_test_task(tsk)) + return; } - if (state == AUDIT_DISABLED) - return; + context->arch = syscall_get_arch(); + context->major = major; + context->argv[0] = a1; + context->argv[1] = a2; + context->argv[2] = a3; + context->argv[3] = a4; context->serial = 0; context->ctime = current_kernel_time64(); context->in_syscall = 1; -- cgit v1.2.3-71-gd317 From 94d14e3e7b41d99f0d62a41fd856183057e1e474 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 14 Feb 2018 21:47:44 -0500 Subject: audit: bail before bug check if audit disabled If audit is disabled, who cares if there is a bug indicating syscall in process or names already recorded. Bail immediately on audit disabled. 
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bc534bfb49a4..4e0a4ac803db 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1511,14 +1511,11 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (!context) + if (!audit_enabled || !context) return; BUG_ON(context->in_syscall || context->name_count); - if (!audit_enabled) - return; - state = context->state; if (state == AUDIT_DISABLED) return; -- cgit v1.2.3-71-gd317 From 398953e62ce3b27f9f7805e367195b7ee6705f57 Mon Sep 17 00:00:00 2001 From: Lihao Liang Date: Wed, 22 Nov 2017 19:00:55 +0000 Subject: rcu: Remove unnecessary spinlock in rcu_boot_init_percpu_data() Since rcu_boot_init_percpu_data() is only called at boot time, there is no data race and spinlock is not needed. Signed-off-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..66c73a214cff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3636,12 +3636,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); @@ -3649,7 +3646,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* -- cgit v1.2.3-71-gd317 From a7c8655b073d89303911c89d0fd9fc4be7631fbe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Nov 2017 15:36:35 -0800 Subject: sched/isolation: Eliminate NO_HZ_FULL_ALL Commit 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter") broke CONFIG_NO_HZ_FULL_ALL=y kernels. This breakage is due to the code under CONFIG_NO_HZ_FULL_ALL failing to invoke the shiny new housekeeping functions. This means that rcutorture scenario TREE04 now emits RCU CPU stall warnings due to the RCU grace-period kthreads not being awakened at a time of their choosing, or perhaps even not at all: [ 27.731422] rcu_bh kthread starved for 21001 jiffies! g18446744073709551369 c18446744073709551368 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=3 [ 27.731423] rcu_bh I14936 9 2 0x80080000 [ 27.731435] Call Trace: [ 27.731440] __schedule+0x31a/0x6d0 [ 27.731442] schedule+0x31/0x80 [ 27.731446] schedule_timeout+0x15a/0x320 [ 27.731453] ? call_timer_fn+0x130/0x130 [ 27.731457] rcu_gp_kthread+0x66c/0xea0 [ 27.731458] ? rcu_gp_kthread+0x66c/0xea0 Because no one has complained about CONFIG_NO_HZ_FULL_ALL=y being broken, I hypothesize that no one is in fact using it, other than rcutorture. This commit therefore eliminates CONFIG_NO_HZ_FULL_ALL and updates rcutorture's config files to instead use the nohz_full= kernel parameter to put the desired CPUs into nohz_full mode. 
Fixes: 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter") Reported-by: kernel test robot Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Cc: Ingo Molnar Cc: John Stultz Cc: Jonathan Corbet --- Documentation/timers/NO_HZ.txt | 7 ------- kernel/time/Kconfig | 10 ---------- kernel/time/tick-sched.c | 22 ++-------------------- .../selftests/rcutorture/configs/rcu/TASKS03 | 1 - .../selftests/rcutorture/configs/rcu/TASKS03.boot | 2 +- .../selftests/rcutorture/configs/rcu/TREE04 | 1 - .../selftests/rcutorture/configs/rcu/TREE04.boot | 2 +- .../selftests/rcutorture/configs/rcu/TREE07 | 1 - 8 files changed, 4 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt index 2dcaf9adb7a7..9591092da5e0 100644 --- a/Documentation/timers/NO_HZ.txt +++ b/Documentation/timers/NO_HZ.txt @@ -131,13 +131,6 @@ error message, and the boot CPU will be removed from the mask. Note that this means that your system must have at least two CPUs in order for CONFIG_NO_HZ_FULL=y to do anything for you. -Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies -that all CPUs other than the boot CPU are adaptive-ticks CPUs. This -Kconfig parameter will be overridden by the "nohz_full=" boot parameter, -so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and -the "nohz_full=1" boot parameter is specified, the boot parameter will -prevail so that only CPU 1 will be an adaptive-ticks CPU. - Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded. This is covered in the "RCU IMPLICATIONS" section below. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -113,16 +113,6 @@ config NO_HZ_FULL endchoice -config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default (except CPU 0)" - depends on NO_HZ_FULL - help - If the user doesn't pass the nohz_full boot option to - define the range of full dynticks CPUs, consider that all - CPUs in the system are full dynticks by default. - Note the boot CPU will still be kept outside the range to - handle the timekeeping duty. 
- config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..ccd3782da0bf 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) return 0; } -static int tick_nohz_init_all(void) -{ - int err = -1; - -#ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - err = 0; - cpumask_setall(tick_nohz_full_mask); - tick_nohz_full_running = true; -#endif - return err; -} - void __init tick_nohz_init(void) { int cpu, ret; - if (!tick_nohz_full_running) { - if (tick_nohz_init_all() < 0) - return; - } + if (!tick_nohz_full_running) + return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index c70c51d5ded1..28568b72a31b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -9,5 +9,4 @@ CONFIG_PREEMPT=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=y #CHECK#CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot index cd2a188eeb6d..838297c58318 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot @@ -1 +1 @@ -rcutorture.torture_type=tasks +rcutorture.torture_type=tasks nohz_full=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 27d22695d64c..24c9f6012e35 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -7,7 +7,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index e34c33430447..e6071bb96c7d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 nohz_full=1-7 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 0f4759f4232e..d7afb271a586 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -7,7 +7,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_NO_HZ_FULL_ALL=n CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y -- cgit v1.2.3-71-gd317 From cbf8699996a6e7f2f674b3a2a4cef9f666ff613e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 16 Feb 2018 15:21:20 +0100 Subject: genirq: Let irq thread follow the effective hard irq affinity In case of threaded interrupts the thread follows the affinity setting of the hard interrupt. 
The related function uses the affinity mask which was set by either from user space or via one of the kernel mechanisms. This mask can be wider than the resulting effective affinity of the hard interrupt. As a consequence the thread might become affine to a completely different CPU. Use the effective interrupt affinity if the architecture supports it, so the hard interrupt and the thread stay on the same CPU. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Sebastian Andrzej Siewior --- kernel/irq/manage.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0f922729bab9..d2b3c59f1200 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -855,10 +855,14 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (cpumask_available(desc->irq_common_data.affinity)) - cpumask_copy(mask, desc->irq_common_data.affinity); - else + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; + + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + } else { valid = false; + } raw_spin_unlock_irq(&desc->lock); if (valid) -- cgit v1.2.3-71-gd317 From 3016611eedaff3e75feba54392c105db214a4f52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Dec 2017 09:48:59 -0800 Subject: rcu: Fix CPU offload boot message when no CPUs are offloaded In CONFIG_RCU_NOCB_CPU=y kernels, if the boot parameters indicate that none of the CPUs should in fact be offloaded, the following somewhat obtuse message appears: Offload RCU callbacks from CPUs: . This commit therefore makes the message at least grammatically correct in this case: Offload RCU callbacks from CPUs: (none) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..93f5649ffd2b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2312,8 +2312,11 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); -- cgit v1.2.3-71-gd317 From 3caa973b7a260e7a2a69edc94c300ab9c65148c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 10:38:17 -0800 Subject: rcu: Call touch_nmi_watchdog() while printing stall warnings When RCU stall warning triggers, it can print out a lot of messages while holding spinlocks. If the console device is slow (e.g. an actual or IPMI serial console), it may end up triggering NMI hard lockup watchdog like the following. 
*** CPU printking while holding RCU spinlock PID: 4149739 TASK: ffff881a46baa880 CPU: 13 COMMAND: "CPUThreadPool8" #0 [ffff881fff945e48] crash_nmi_callback at ffffffff8103f7d0 #1 [ffff881fff945e58] nmi_handle at ffffffff81020653 #2 [ffff881fff945eb0] default_do_nmi at ffffffff81020c36 #3 [ffff881fff945ed0] do_nmi at ffffffff81020d32 #4 [ffff881fff945ef0] end_repeat_nmi at ffffffff81956a7e [exception RIP: io_serial_in+21] RIP: ffffffff81630e55 RSP: ffff881fff943b88 RFLAGS: 00000002 RAX: 000000000000ca00 RBX: ffffffff8230e188 RCX: 0000000000000000 RDX: 00000000000002fd RSI: 0000000000000005 RDI: ffffffff8230e188 RBP: ffff881fff943bb0 R8: 0000000000000000 R9: ffffffff820cb3c4 R10: 0000000000000019 R11: 0000000000002000 R12: 00000000000026e1 R13: 0000000000000020 R14: ffffffff820cd398 R15: 0000000000000035 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000 --- --- #5 [ffff881fff943b88] io_serial_in at ffffffff81630e55 #6 [ffff881fff943b90] wait_for_xmitr at ffffffff8163175c #7 [ffff881fff943bb8] serial8250_console_putchar at ffffffff816317dc #8 [ffff881fff943bd8] uart_console_write at ffffffff8162ac00 #9 [ffff881fff943c08] serial8250_console_write at ffffffff81634691 #10 [ffff881fff943c80] univ8250_console_write at ffffffff8162f7c2 #11 [ffff881fff943c90] console_unlock at ffffffff810dfc55 #12 [ffff881fff943cf0] vprintk_emit at ffffffff810dffb5 #13 [ffff881fff943d50] vprintk_default at ffffffff810e01bf #14 [ffff881fff943d60] vprintk_func at ffffffff810e1127 #15 [ffff881fff943d70] printk at ffffffff8119a8a4 #16 [ffff881fff943dd0] print_cpu_stall_info at ffffffff810eb78c #17 [ffff881fff943e88] rcu_check_callbacks at ffffffff810ef133 #18 [ffff881fff943ee8] update_process_times at ffffffff810f3497 #19 [ffff881fff943f10] tick_sched_timer at ffffffff81103037 #20 [ffff881fff943f38] __hrtimer_run_queues at ffffffff810f3f38 #21 [ffff881fff943f88] hrtimer_interrupt at ffffffff810f442b *** CPU triggering the hardlockup watchdog PID: 4149709 TASK: ffff88010f88c380 CPU: 26 COMMAND: "CPUThreadPool35" #0 [ffff883fff1059d0] machine_kexec at ffffffff8104a874 #1 [ffff883fff105a30] __crash_kexec at ffffffff811116cc #2 [ffff883fff105af0] __crash_kexec at ffffffff81111795 #3 [ffff883fff105b08] panic at ffffffff8119a6ae #4 [ffff883fff105b98] watchdog_overflow_callback at ffffffff81135dbd #5 [ffff883fff105bb0] __perf_event_overflow at ffffffff81186866 #6 [ffff883fff105be8] perf_event_overflow at ffffffff81192bc4 #7 [ffff883fff105bf8] intel_pmu_handle_irq at ffffffff8100b265 #8 [ffff883fff105df8] perf_event_nmi_handler at ffffffff8100489f #9 [ffff883fff105e58] nmi_handle at ffffffff81020653 #10 [ffff883fff105eb0] default_do_nmi at ffffffff81020b94 #11 [ffff883fff105ed0] do_nmi at ffffffff81020d32 #12 [ffff883fff105ef0] end_repeat_nmi at ffffffff81956a7e [exception RIP: queued_spin_lock_slowpath+248] RIP: ffffffff810da958 RSP: ffff883fff103e68 RFLAGS: 00000046 RAX: 0000000000000000 RBX: 0000000000000046 RCX: 00000000006d0000 RDX: ffff883fff49a950 RSI: 0000000000d10101 RDI: ffffffff81e54300 RBP: ffff883fff103e80 R8: ffff883fff11a950 R9: 0000000000000000 R10: 000000000e5873ba R11: 000000000000010f R12: ffffffff81e54300 R13: 0000000000000000 R14: ffff88010f88c380 R15: ffffffff81e54300 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 --- --- #13 [ffff883fff103e68] queued_spin_lock_slowpath at ffffffff810da958 #14 [ffff883fff103e70] _raw_spin_lock_irqsave at ffffffff8195550b #15 [ffff883fff103e88] rcu_check_callbacks at ffffffff810eed18 #16 [ffff883fff103ee8] update_process_times at ffffffff810f3497 #17 [ffff883fff103f10] 
tick_sched_timer at ffffffff81103037 #18 [ffff883fff103f38] __hrtimer_run_queues at ffffffff810f3f38 #19 [ffff883fff103f88] hrtimer_interrupt at ffffffff810f442b --- --- Avoid spuriously triggering the NMI hardlockup watchdog by touching it from the print functions. show_state_filter() shares the same problem and solution. v2: Relocate the comment to where it belongs. Signed-off-by: Tejun Heo Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 93f5649ffd2b..7e31fb1e9fd8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); sched_show_task(t); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1677,6 +1683,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) char *ticks_title; unsigned long ticks_value; + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + if (rsp->gpnum == rdp->gpnum) { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; -- cgit v1.2.3-71-gd317 From bec06785fe2c866ccf39cafa18935d88d77d559a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jan 2018 12:16:42 -0800 Subject: rcu: Remove obsolete boost statistics for debugfs The debugfs interface displayed statistics on RCU priority boosting, but this interface has since been removed. This commit therefore removes the no-longer-used rcu_node structure's ->n_tasks_boosted, ->n_exp_boosts, and ->n_normal_boosts fields along with their updates. If this information proves necessary in the future, the corresponding event traces will be added. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 ------ kernel/rcu/tree_plugin.h | 8 ++------ 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..01abd1c4e5da 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -146,12 +146,6 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e31fb1e9fd8..0d552985b905 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -963,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period.
*/ - if (rnp->exp_tasks != NULL) { + if (rnp->exp_tasks != NULL) tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { + else tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to -- cgit v1.2.3-71-gd317 From 62df63e048daf4e29373bbbaae3751e5af5d9502 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jan 2018 12:21:40 -0800 Subject: rcu: Remove obsolete callback-invocation statistics for debugfs The debugfs interface displayed statistics on RCU callback invocation but this interface has since been removed. This commit therefore removes the no-longer-used rcu_data structure's ->n_cbs_invoked and ->n_nocbs_invoked fields along with their updates. If this information proves necessary in the future, the corresponding event traces will be added. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree.h | 2 -- kernel/rcu/tree_plugin.h | 1 - 3 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 66c73a214cff..8e0711954bbf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2691,7 +2691,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->n_cbs_invoked += count; rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 01abd1c4e5da..b258fac73524 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -211,8 +211,6 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0d552985b905..1c2d58a85511 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2243,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) smp_mb__before_atomic(); /* _add after CB invocation. */ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - rdp->n_nocbs_invoked += c; } return 0; } -- cgit v1.2.3-71-gd317 From 01c495f72a3b5a210e5689deba1ef33c82e8aa30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jan 2018 12:36:00 -0800 Subject: rcu: Remove obsolete __rcu_pending() statistics for debugfs The debugfs interface displayed statistics on RCU-pending checks but this interface has since been removed. This commit therefore removes the no-longer-used rcu_data structure's ->n_rcu_pending, ->n_rp_core_needs_qs, ->n_rp_report_qs, ->n_rp_cb_ready, ->n_rp_cpu_needs_gp, ->n_rp_gp_completed, ->n_rp_gp_started, ->n_rp_nocb_defer_wakeup, and ->n_rp_need_nothing fields along with their updates. If this information proves necessary in the future, the corresponding event traces will be added. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 31 ++++++------------------------- kernel/rcu/tree.h | 17 +++-------------- 2 files changed, 9 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e0711954bbf..99d59be761d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3354,8 +3354,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - rdp->n_rcu_pending++; - /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); @@ -3364,48 +3362,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { - rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { - rdp->n_rp_report_qs++; + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) return 1; - } /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rdp->n_rp_cb_ready++; + if (rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; - } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; + if (cpu_needs_another_gp(rsp, rdp)) return 1; - } /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; + if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ return 1; - } /* Has a new RCU grace period started? */ if (READ_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ - rdp->n_rp_gp_started++; + unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - } /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) { - rdp->n_rp_nocb_defer_wakeup++; + if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - } /* nothing to do */ - rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b258fac73524..d29bab8dea28 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -226,18 +226,7 @@ struct rcu_data { /* Grace period that needs help */ /* from cond_resched(). */ - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_core_needs_qs; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_nocb_defer_wakeup; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ + /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; @@ -248,7 +237,7 @@ struct rcu_data { atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 7) Callback offloading. */ + /* 6) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -275,7 +264,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 8) RCU CPU stall data. */ + /* 7) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. 
*/ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ -- cgit v1.2.3-71-gd317 From d62df57370a5215fbf1b088d2ee51fa5d69bd0c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jan 2018 13:10:49 -0800 Subject: rcu: Remove obsolete force-quiescent-state statistics for debugfs The debugfs interface displayed statistics on RCU's attempts to force quiescent states, but this interface has since been removed. This commit therefore removes the no-longer-used rcu_state structure's ->n_force_qs_lh and ->n_force_qs_ngp fields along with their updates. (Though the ->n_force_qs_ngp field was actually not used at all, embarrassingly enough.) If this information proves necessary in the future, the corresponding event traces will be added. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +---- kernel/rcu/tree.h | 4 ---- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 99d59be761d1..86fc087c7777 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2844,10 +2844,8 @@ static void force_quiescent_state(struct rcu_state *rsp) !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; + if (ret) return; - } rnp_old = rnp; } /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ @@ -2856,7 +2854,6 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d29bab8dea28..2edd56397b00 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -355,10 +355,6 @@ struct rcu_state { /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ -- cgit v1.2.3-71-gd317 From d07aee2c035e04dce11209a870b48091a47bd04a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jan 2018 12:08:20 -0800 Subject: rcu: More clearly identify grace-period kthread stack dump It is not always obvious that the stack dump from a starved grace-period kthread isn't instead that of a CPU stalling the current grace period. This commit therefore adds a pr_err() flagging these dumps. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 86fc087c7777..4d7c727020f0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_kthread ? rsp->gp_kthread->state : ~0, rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { + pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); } -- cgit v1.2.3-71-gd317 From bfbd767d4dba7fb41df0d356eecb7bbd99d0a7ee Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Thu, 11 Jan 2018 12:58:53 -0800 Subject: rcu: Consolidate rcu.h #ifdefs The kernel/rcu/rcu.h file has a pair of consecutive #ifdefs on CONFIG_TINY_RCU, so this commit consolidates them, thus saving a few lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..c90812673d54 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,24 +356,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 -#ifdef CONFIG_TINY_RCU -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } -#else /* #ifdef CONFIG_TINY_RCU */ -void rcu_request_urgent_qs_task(struct task_struct *t); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, -- cgit v1.2.3-71-gd317 From 65518db86b9ed1180b013c8a34c73c6ff7275886 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Tue, 16 Jan 2018 17:05:14 +0800 Subject: rcu: Remove redundant nxttail index macro define RCU's nxttail has been optimized to be a rcu_segcblist, which is a multi-tailed linked list with macros defined for the indexes for each tail. The indexes have been defined in linux/rcu_segcblist.h, so this commit removes the redundant definitions in kernel/rcu/tree.h. Signed-off-by: Liu Changcheng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2edd56397b00..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,13 +178,6 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ -- cgit v1.2.3-71-gd317 From a32e01ee689794a26bdfdbaa7e8c334576cee36c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 17 Jan 2018 06:24:30 -0800 Subject: rcu: Use wrapper for lockdep asserts Commits c0b334c5bfa9 and ea9b0c8a26a2 introduced new sparse warnings by accessing rcu_node->lock directly and ignoring the __private marker. Introduce a new wrapper and use it. Also fix a similar problem in srcutree.c introduced by a3883df3935e. Signed-off-by: Matthew Wilcox Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 5 ++++- kernel/rcu/srcutree.c | 2 +- kernel/rcu/tree.c | 24 ++++++++++++------------ kernel/rcu/tree_plugin.h | 4 ++-- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c90812673d54..5d13f651cf08 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -337,7 +337,7 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -348,6 +348,9 @@ do { \ ___locked; \ }) +#define raw_lockdep_assert_held_rcu_node(p) \ + lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..9c6e0eaab4d5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - lockdep_assert_held(&sp->lock); + lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4d7c727020f0..99d404c6bbbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) @@ -1629,7 +1629,7 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * If RCU is idle, we just wait for the next grace period. @@ -1676,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * Pick up grace-period number for new callbacks. If this @@ -1804,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1844,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1872,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Handle the ends of any preceding grace periods first. 
*/ if (rdp->completed == rnp->completed && @@ -2297,7 +2297,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2359,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - lockdep_assert_held(&rcu_get_root(rsp)->lock); + raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2384,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Walk up the rcu_node hierarchy. */ for (;;) { @@ -2448,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2593,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -3596,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c2d58a85511..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); @@ -1044,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; -- cgit v1.2.3-71-gd317 From 274afd6bfa64984724963204b832cc71565a7740 Mon Sep 17 00:00:00 2001 From: Ildar Ismagilov Date: Tue, 30 Jan 2018 15:40:16 -0800 Subject: rcu: Fix misprint in srcu_funnel_exp_start The srcu_funnel_exp_start() function checks to see if the srcu_struct structure's expedited grace period counter needs updating to reflect a newly arrived request for an expedited SRCU grace period. Unfortunately, the check is backwards, so this commit reverses the sense of the test. Signed-off-by: Ildar Ismagilov Signed-off-by: Paul E. 
McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9c6e0eaab4d5..045b559b9f22 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_rcu_node(sp, flags); - if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore_rcu_node(sp, flags); } -- cgit v1.2.3-71-gd317 From 9a414201ae7ea089699a0cbd36533345ca17233b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2018 19:23:24 -0800 Subject: rcu: Add more tracing of expedited grace periods This commit adds more tracing of expedited grace periods to enable improved debugging of slowdowns. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 3 +++ kernel/rcu/rcu.h | 8 +++++++- kernel/rcu/tree_exp.h | 12 ++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 0b50fda80db0..e56a618f2a59 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -179,6 +179,9 @@ TRACE_EVENT(rcu_grace_period_init, * * "snap": Captured snapshot of expedited grace period sequence number. * "start": Started a real expedited grace period. + * "reset": Started resetting the tree + * "select": Started selecting the CPUs to wait on. + * "startwait": Started waiting on selected CPUs. * "end": Ended a real expedited grace period. * "endwake": Woke piggybackers up. * "done": Someone else did the expedited grace period for us. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5d13f651cf08..507a0802c717 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) WARN_ON_ONCE(rcu_seq_state(*sp) != 1); } +/* Compute the end-of-grace-period value for the specified sequence number. */ +static inline unsigned long rcu_seq_endval(unsigned long *sp) +{ + return (*sp | RCU_SEQ_STATE_MASK) + 1; +} + /* Adjust sequence number for end of update-side operation. */ static inline void rcu_seq_end(unsigned long *sp) { smp_mb(); /* Ensure update-side operation before counter increment. */ WARN_ON_ONCE(!rcu_seq_state(*sp)); - WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); + WRITE_ONCE(*sp, rcu_seq_endval(sp)); } /* Take a snapshot of the update side's sequence number. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..70ad12abde36 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -28,6 +28,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) rcu_seq_start(&rsp->expedited_sequence); } +/* + * Return then value that expedited-grace-period counter will have + * at the end of the current grace period. + */ +static unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +{ + return rcu_seq_endval(&rsp->expedited_sequence); +} + /* * Record the end of an expedited grace period. 
*/ @@ -366,7 +375,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, int ret; struct rcu_node *rnp; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -443,6 +454,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; -- cgit v1.2.3-71-gd317 From 7f5d42d05155523a4c42c2c5170f2a368217aed5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2018 19:40:22 -0800 Subject: rcu: Trace expedited GP delays due to transitioning CPUs If a CPU is transitioning to or from offline state, an expedited grace period may undergo a timed wait. This timed wait can unduly delay grace periods, so this commit adds a trace statement to make it visible. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcu/tree_exp.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e56a618f2a59..d8c33298c153 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -181,6 +181,7 @@ TRACE_EVENT(rcu_grace_period_init, * "start": Started a real expedited grace period. * "reset": Started resetting the tree * "select": Started selecting the CPUs to wait on. + * "selectofl": Selected CPU partially offline. * "startwait": Started waiting on selected CPUs. * "end": Ended a real expedited grace period. * "endwake": Woke piggybackers up. diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 70ad12abde36..fecb6b6ab452 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -32,7 +32,7 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) * Return then value that expedited-grace-period counter will have * at the end of the current grace period. */ -static unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) { return rcu_seq_endval(&rsp->expedited_sequence); } @@ -428,6 +428,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } -- cgit v1.2.3-71-gd317 From 65963d246147c46aafda2b04523d6dbe6c457e7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2018 20:24:15 -0800 Subject: rcu: Make expedited RCU CPU selection avoid unnecessary stores This commit reworks the first loop in sync_rcu_exp_select_cpus() to avoid doing unnecessary stores to other CPUs' rcu_data structures. This speeds up that first loop by roughly a factor of two on an old x86 system. In the case where the system is mostly idle, this loop incurs a large fraction of the overhead of synchronize_rcu_expedited(). There is less benefit on busy systems because the overhead of the smp_call_function_single() in the second loop dominates in that case.
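A user-space toy of the reworked first pass (plain C, invented names and values, not kernel code): per-CPU state is sampled, but the cross-CPU store happens only for CPUs that may actually receive an IPI, keeping their cache lines clean on a mostly idle system:

    #include <stdbool.h>
    #include <stdio.h>

    #define NCPU 8

    static bool in_eqs[NCPU] = { true, false, true, true, false, true, true, true };
    static int exp_snap[NCPU];      /* written only when an IPI may follow */

    int main(void)
    {
            unsigned long mask_ofl_test = 0, mask_ofl_ipi;
            int cpu;

            for (cpu = 0; cpu < NCPU; cpu++) {
                    if (in_eqs[cpu])
                            mask_ofl_test |= 1UL << cpu;    /* already quiescent: no store */
                    else
                            exp_snap[cpu] = 42;             /* store only for IPI candidates */
            }
            mask_ofl_ipi = ((1UL << NCPU) - 1) & ~mask_ofl_test;
            for (cpu = 0; cpu < NCPU; cpu++)
                    if (mask_ofl_ipi & (1UL << cpu))
                            printf("IPI CPU %d (snap %d)\n", cpu, exp_snap[cpu]);
            return 0;
    }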
However, it is not unusual to do configuration changes involving RCU grace periods (both expedited and normal) while the system is mostly idle, so this optimization is worth doing. While we are in the area, this commit also adds parentheses to arguments used by the for_each_leaf_node_possible_cpu() macro. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 16 +++++++++++++--- kernel/rcu/tree_exp.h | 21 ++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 507a0802c717..1c868bcfd705 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -301,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) + for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + (cpu) <= rnp->grphi; \ + (cpu) = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Iterate over all CPUs in a leaf RCU node's specified mask. + */ +#define rcu_find_next_bit(rnp, cpu, mask) \ + ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) +#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ + for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + (cpu) <= rnp->grphi; \ + (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) /* * Wrappers for the rcu_node::lock acquire and release. diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fecb6b6ab452..6ad87642f44a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -383,15 +383,22 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; - rdp->exp_dynticks_snap = - rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || - !(rnp->qsmaskinitnext & rdp->grpmask)) - mask_ofl_test |= rdp->grpmask; + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) + mask_ofl_test |= mask; + else + rdp->exp_dynticks_snap = snap; + } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -405,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); -- cgit v1.2.3-71-gd317 From cb4081cd4e0c9fd88d186553edcc78e1bbb96ce6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Dec 2017 15:28:11 -0800 Subject: srcu: Abstract function name This commit moves to __func__ for function names in the name of better resilience to change. Signed-off-by: Paul E.
McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..01f4a8bf0e03 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); -- cgit v1.2.3-71-gd317 From a35d13ec3686612b17771bf1abcad75d2ebd42ef Mon Sep 17 00:00:00 2001 From: Ildar Ismagilov Date: Wed, 31 Jan 2018 22:39:53 +0300 Subject: srcu: Prevent sdp->srcu_gp_seq_needed_exp counter wrap SRCU checks each srcu_data structure's grace-period number for counter wrap four times per cycle by default. This frequency guarantees that normal comparisons will detect potential wrap. However, the expedited grace-period number is not checked. The consequences are not too horrible (a failure to expedite a grace period when requested), but it would be good to avoid such things. This commit therefore adds this check to the expedited grace-period number. Signed-off-by: Ildar Ismagilov Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 01f4a8bf0e03..bb9607baad9b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -579,6 +579,9 @@ static void srcu_gp_end(struct srcu_struct *sp) if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irqrestore_rcu_node(sdp, flags); } } -- cgit v1.2.3-71-gd317 From 8ddbd8832d25463a996a1075c35f312fb43c86b1 Mon Sep 17 00:00:00 2001 From: Ildar Ismagilov Date: Wed, 31 Jan 2018 22:42:21 +0300 Subject: srcu: Reduce scans of srcu_data in counter wrap check Currently, given a multi-level srcu_node tree, SRCU can scan the full set of srcu_data structures at each level when cleaning up after a grace period. This, though harmless otherwise, represents pointless overhead. This commit therefore eliminates this overhead by scanning the srcu_data structures only when traversing the leaf srcu_node structures. Signed-off-by: Ildar Ismagilov Signed-off-by: Paul E.
McKenney --- kernel/rcu/srcutree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bb9607baad9b..d582a2352cdb 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -527,6 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + bool last_lvl; int cpu; unsigned long flags; unsigned long gpseq; @@ -559,7 +560,8 @@ static void srcu_gp_end(struct srcu_struct *sp) rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - if (snp >= sp->level[rcu_num_lvls - 1]) + last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); @@ -572,7 +574,7 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check)) + if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); -- cgit v1.2.3-71-gd317 From a72da917f1861ed06f8824328b1b3ecd003a5b5b Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 14 Feb 2018 18:05:24 +0900 Subject: srcu: Remove dead code in srcu_gp_end() Of course, compilers will optimize out dead code. Anyway, remove any dead code for better readability. Signed-off-by: Byungchul Park Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d582a2352cdb..f962f4428c39 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -532,7 +532,6 @@ static void srcu_gp_end(struct srcu_struct *sp) unsigned long cbdelay; bool cbs; unsigned long flags; unsigned long gpseq; int idx; - int idxnext; unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; @@ -556,7 +555,6 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; -- cgit v1.2.3-71-gd317 From 6308f3477510e15391ac82deaee66f8425339014 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Feb 2018 14:20:45 -0800 Subject: rcu: Remove SRCU throttling The code in srcu_gp_end() inserts a delay every 0x3ff grace periods in order to prevent SRCU grace-period work from consuming an entire CPU when there is a long sequence of expedited SRCU grace-period requests. However, all of SRCU's grace-period work is carried out in workqueues, which are in turn within kthreads, which are automatically throttled as needed by the scheduler. In particular, if there is plenty of idle time, there is no point in throttling. This commit therefore removes the expedited SRCU grace-period throttling. Signed-off-by: Paul E.
McKenney --- kernel/rcu/srcutree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f962f4428c39..e2796f79a597 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -596,9 +596,7 @@ static void srcu_gp_end(struct srcu_struct *sp) ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); spin_unlock_irq_rcu_node(sp); - /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff - ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, 0); } else { spin_unlock_irq_rcu_node(sp); } -- cgit v1.2.3-71-gd317 From 68a675d433fbaa8a39aa9a63695da68192b70e3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Dec 2017 14:26:56 -0800 Subject: rcutorture: Replace multi-instance kzalloc() with kcalloc() This commit replaces array-allocation calls to kzalloc() with equivalent calls to kcalloc(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..b4a201dcbc55 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1557,11 +1557,10 @@ static int rcu_torture_barrier_init(void) atomic_set(&barrier_cbs_count, 0); atomic_set(&barrier_cbs_invoked, 0); barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), GFP_KERNEL); barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { @@ -1799,7 +1798,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; if (nfakewriters > 0) { - fakewriter_tasks = kzalloc(nfakewriters * + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1814,7 +1813,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("out of memory"); -- cgit v1.2.3-71-gd317 From e0d31a34c6db6381bfc630a9ee93f05b9447dcc3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Dec 2017 15:22:38 -0800 Subject: rcutorture: Abstract function and module names This commit moves to __func__ for function names and to KBUILD_MODNAME for module names, all in the name of better resilience to change. Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b4a201dcbc55..0f94025c672a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -924,19 +924,19 @@ rcu_torture_writer(void *arg) if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) - pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); + pr_alert("%s: gp_cond without primitives.\n", __func__); if (gp_exp1 && cur_ops->exp_sync) synctype[nsynctypes++] = RTWS_EXP_SYNC; else if (gp_exp && !cur_ops->exp_sync) - pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); + pr_alert("%s: gp_exp without primitives.\n", __func__); if (gp_normal1 && cur_ops->deferred_free) synctype[nsynctypes++] = RTWS_DEF_FREE; else if (gp_normal && !cur_ops->deferred_free) - pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + pr_alert("%s: gp_normal without primitives.\n", __func__); if (gp_sync1 && cur_ops->sync) synctype[nsynctypes++] = RTWS_SYNC; else if (gp_sync && !cur_ops->sync) - pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + pr_alert("%s: gp_sync without primitives.\n", __func__); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1673,7 +1673,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) * next grace period. Unlikely, but can happen. If it * does happen, the debug-objects subsystem won't have splatted. */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); + pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -1690,7 +1690,7 @@ static void rcu_test_debug_objects(void) init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ @@ -1705,11 +1705,11 @@ static void rcu_test_debug_objects(void) /* Wait for them all to get done so we can safely return. */ rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } -- cgit v1.2.3-71-gd317 From eb0339934f1d468ff09d9be1c608c89cb1da850b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Dec 2017 10:48:41 -0800 Subject: rcutorture: Avoid fake-writer use of undefined primitives Currently the rcu_torture_fakewriter() function invokes cur_ops->sync() and cur_ops->exp_sync() without first checking to see if they are in fact non-NULL. This results in kernel NULL pointer dereferences when testing RCU implementations that choose not to provide the full set of primitives. 
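The failure mode reduces to calling through a NULL hook in an ops table; a stand-alone illustration (plain C, invented names) of the guard the fix adds at each call site:

    #include <stdio.h>

    struct torture_ops {
            void (*sync)(void);     /* optional: may be NULL in subsetted implementations */
            void (*exp_sync)(void); /* optional */
    };

    static void my_sync(void) { puts("sync"); }

    int main(void)
    {
            struct torture_ops cur_ops = { .sync = my_sync, .exp_sync = NULL };

            if (cur_ops.sync)       /* guard before calling an optional hook */
                    cur_ops.sync();
            if (cur_ops.exp_sync)   /* NULL here, so the call is skipped, not crashed */
                    cur_ops.exp_sync();
            return 0;
    }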
Given that it is perfectly reasonable to have specialized RCU implementations that provide only a subset of the RCU API, this is a bug in rcutorture. This commit therefore makes rcu_torture_fakewriter() check function pointers before invoking them, thus allowing it to test subsetted RCU implementations. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0f94025c672a..6c46cd1d8fd7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1045,13 +1045,13 @@ rcu_torture_fakewriter(void *arg) torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (torture_random(&rand) & 0x80) + if (cur_ops->sync && torture_random(&rand) & 0x80) cur_ops->sync(); - else + else if (cur_ops->exp_sync) cur_ops->exp_sync(); - } else if (gp_normal) { + } else if (gp_normal && cur_ops->sync) { cur_ops->sync(); - } else { + } else if (cur_ops->exp_sync) { cur_ops->exp_sync(); } stutter_wait("rcu_torture_fakewriter"); -- cgit v1.2.3-71-gd317 From f7c0e6ad4bb1ab1cf358b8815f7d0ba7ff94d786 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Dec 2017 11:37:24 -0800 Subject: rcutorture: Re-enable testing of dynamic expediting During boot, normal grace periods are processed as expedited. When rcutorture is built into the kernel, it starts during boot and thus detects that normal grace periods are unconditionally expedited. Therefore, rcutorture concludes that there is no point in trying to dynamically enable expediting, so it disables this aspect of testing, which is a bit of an overreaction to the temporary boot-time expediting. This commit therefore rechecks forced expediting throughout the test, enabling dynamic expediting if normal grace periods are processed normally at any point. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6c46cd1d8fd7..2964b9236ddc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -909,14 +909,10 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) { + if (!can_expedite) pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Disabled dynamic grace-period expediting.\n", - torture_type); - } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) @@ -1011,6 +1007,9 @@ rcu_torture_writer(void *arg) rcu_unexpedite_gp(); if (++expediting > 3) expediting = -expediting; + } else if (!can_expedite) { /* Disabled during boot, recheck.
*/ + can_expedite = !rcu_gp_is_expedited() && + !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); @@ -1021,6 +1020,10 @@ rcu_torture_writer(void *arg) while (can_expedite && expediting++ < 0) rcu_unexpedite_gp(); WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " Dynamic grace-period expediting was disabled.\n", + torture_type); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; -- cgit v1.2.3-71-gd317 From db0c1a8aba31edd1432f3a5925a32107065b5568 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Dec 2017 12:23:10 -0800 Subject: rcutorture: Record which grace-period primitives are tested The rcu_torture_writer() function adapts to requested testing from module parameters as well as the function pointers in the structure referenced by cur_ops. However, as long as the module parameters do not conflict with the function pointers, this adaptation is silent. This silence can result in confusion as to exactly what was tested, which could in turn result in untested RCU code making its way into mainline. This commit therefore makes rcu_torture_writer() announce exactly which portions of RCU's API it ends up testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2964b9236ddc..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -917,22 +917,30 @@ rcu_torture_writer(void *arg) /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; - else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) + pr_info("%s: Testing conditional GPs.\n", __func__); + } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); - if (gp_exp1 && cur_ops->exp_sync) + } + if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; - else if (gp_exp && !cur_ops->exp_sync) + pr_info("%s: Testing expedited GPs.\n", __func__); + } else if (gp_exp && !cur_ops->exp_sync) { pr_alert("%s: gp_exp without primitives.\n", __func__); - if (gp_normal1 && cur_ops->deferred_free) + } + if (gp_normal1 && cur_ops->deferred_free) { synctype[nsynctypes++] = RTWS_DEF_FREE; - else if (gp_normal && !cur_ops->deferred_free) + pr_info("%s: Testing asynchronous GPs.\n", __func__); + } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); - if (gp_sync1 && cur_ops->sync) + } + if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; - else if (gp_sync && !cur_ops->sync) + pr_info("%s: Testing normal GPs.\n", __func__); + } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); + } if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* -- cgit v1.2.3-71-gd317 From 85ba6bfe8bb2a4d907f7380a8f37b31616ad694e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 1 Feb 2018 19:19:04 -0800 Subject: torture: Provide more sensible nreader/nwriter defaults for rcuperf The default values for nreader and nwriter are apparently not all that user-friendly, resulting in people doing scalability tests that ran all runs at large scale. This commit therefore makes both the nreaders and nwriters module default to the number of CPUs, and adds a comment to rcuperf.c stating that the number of CPUs should be specified using the nr_cpus kernel boot parameter. This commit also eliminates the redundant rcuperf scripting specification of default values for these parameters. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 21 ++++++++++++++++++- .../rcutorture/configs/rcuperf/ver_functions.sh | 24 +--------------------- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + */ + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, 0, "Number of RCU reader threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh index b9603115d7c7..d36b8fd6f0fc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -20,32 +20,10 @@ # # Authors: Paul E. McKenney -# rcuperf_param_nreaders bootparam-string -# -# Adds nreaders rcuperf module parameter if not already specified. -rcuperf_param_nreaders () { - if ! echo "$1" | grep -q "rcuperf.nreaders" - then - echo rcuperf.nreaders=-1 - fi -} - -# rcuperf_param_nwriters bootparam-string -# -# Adds nwriters rcuperf module parameter if not already specified. -rcuperf_param_nwriters () { - if ! echo "$1" | grep -q "rcuperf.nwriters" - then - echo rcuperf.nwriters=-1 - fi -} - # per_version_boot_params bootparam-string config-file seconds # # Adds per-version torture-module parameters to kernels supporting them. 
per_version_boot_params () { - echo $1 `rcuperf_param_nreaders "$1"` \ - `rcuperf_param_nwriters "$1"` \ - rcuperf.shutdown=1 \ + echo $1 rcuperf.shutdown=1 \ rcuperf.verbose=1 } -- cgit v1.2.3-71-gd317 From 7ebb66a12f85bc375beaf45ca900427fe47aa8f7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 13 Feb 2018 13:37:25 +0000 Subject: sched/fair: Avoid an unnecessary lookup of current CPU ID during wake_affine The only caller of wake_affine() knows the CPU ID. Pass it in instead of rechecking it. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 820f94c9b200..0132572d7523 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5751,9 +5751,8 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, } static int wake_affine(struct sched_domain *sd, struct task_struct *p, - int prev_cpu, int sync) + int this_cpu, int prev_cpu, int sync) { - int this_cpu = smp_processor_id(); int target = nr_cpumask_bits; if (sched_feat(WA_IDLE)) @@ -6376,7 +6375,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); + new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { -- cgit v1.2.3-71-gd317 From eeb60398639143c11ff2c8b509e3a471411bb5d3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 13 Feb 2018 13:37:26 +0000 Subject: sched/fair: Defer calculation of 'prev_eff_load' in wake_affine_weight() until needed On sync wakeups, the previous CPU effective load may not be used so delay the calculation until it's needed. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-3-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0132572d7523..ae3e6f877711 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5724,7 +5724,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long task_load; this_eff_load = target_load(this_cpu, sd->wake_idx); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); if (sync) { unsigned long current_load = task_h_load(current); @@ -5742,6 +5741,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); + prev_eff_load = source_load(prev_cpu, sd->wake_idx); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; -- cgit v1.2.3-71-gd317 From 082f764a2f3f2968afa1a0b04a1ccb1b70633844 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 13 Feb 2018 13:37:27 +0000 Subject: sched/fair: Do not migrate on wake_affine_weight() if weights are equal wake_affine_weight() will consider migrating a task to, or near, the current CPU if there is a load imbalance. 
If the CPUs share LLC then either CPU is valid as a search-for-idle-sibling target and equally appropriate for stacking two tasks on one CPU if an idle sibling is unavailable. If they do not share cache then a cross-node migration potentially impacts locality so while they are equal from a CPU capacity point of view, they are not equal in terms of memory locality. In either case, it's more appropriate to migrate only if there is a difference in their effective load. This patch modifies wake_affine_weight() to only consider migrating a task if there is a load imbalance for normal wakeups but will allow potential stacking if the loads are equal and it's a sync wakeup. For the most part, the difference in performance is marginal. For example, on a 4-socket server running netperf UDP_STREAM on localhost the differences are as follows:

                            4.15.0                 4.15.0
                             16rc0          noequal-v1r23
Hmean     send-64          355.47 (   0.00%)      349.50 (  -1.68%)
Hmean     send-128         697.98 (   0.00%)      693.35 (  -0.66%)
Hmean     send-256        1328.02 (   0.00%)     1318.77 (  -0.70%)
Hmean     send-1024       5051.83 (   0.00%)     5051.11 (  -0.01%)
Hmean     send-2048       9637.02 (   0.00%)     9601.34 (  -0.37%)
Hmean     send-3312      14355.37 (   0.00%)    14414.51 (   0.41%)
Hmean     send-4096      16464.97 (   0.00%)    16301.37 (  -0.99%)
Hmean     send-8192      26722.42 (   0.00%)    26428.95 (  -1.10%)
Hmean     send-16384     38137.81 (   0.00%)    38046.11 (  -0.24%)
Hmean     recv-64          355.47 (   0.00%)      349.50 (  -1.68%)
Hmean     recv-128         697.98 (   0.00%)      693.35 (  -0.66%)
Hmean     recv-256        1328.02 (   0.00%)     1318.77 (  -0.70%)
Hmean     recv-1024       5051.83 (   0.00%)     5051.11 (  -0.01%)
Hmean     recv-2048       9636.95 (   0.00%)     9601.30 (  -0.37%)
Hmean     recv-3312      14355.32 (   0.00%)    14414.48 (   0.41%)
Hmean     recv-4096      16464.74 (   0.00%)    16301.16 (  -0.99%)
Hmean     recv-8192      26721.63 (   0.00%)    26428.17 (  -1.10%)
Hmean     recv-16384     38136.00 (   0.00%)    38044.88 (  -0.24%)
Stddev    send-64            7.30 (   0.00%)        4.75 (  34.96%)
Stddev    send-128          15.15 (   0.00%)       22.38 ( -47.66%)
Stddev    send-256          13.99 (   0.00%)       19.14 ( -36.81%)
Stddev    send-1024        105.73 (   0.00%)       67.38 (  36.27%)
Stddev    send-2048        294.57 (   0.00%)      223.88 (  24.00%)
Stddev    send-3312        302.28 (   0.00%)      271.74 (  10.10%)
Stddev    send-4096        195.92 (   0.00%)      121.10 (  38.19%)
Stddev    send-8192        399.71 (   0.00%)      563.77 ( -41.04%)
Stddev    send-16384      1163.47 (   0.00%)     1103.68 (   5.14%)
Stddev    recv-64            7.30 (   0.00%)        4.75 (  34.96%)
Stddev    recv-128          15.15 (   0.00%)       22.38 ( -47.66%)
Stddev    recv-256          13.99 (   0.00%)       19.14 ( -36.81%)
Stddev    recv-1024        105.73 (   0.00%)       67.38 (  36.27%)
Stddev    recv-2048        294.59 (   0.00%)      223.89 (  24.00%)
Stddev    recv-3312        302.24 (   0.00%)      271.75 (  10.09%)
Stddev    recv-4096        196.03 (   0.00%)      121.14 (  38.20%)
Stddev    recv-8192        399.86 (   0.00%)      563.65 ( -40.96%)
Stddev    recv-16384      1163.79 (   0.00%)     1103.86 (   5.15%)

The difference in overall performance is marginal but note that most measurements are less variable. There were similar observations for other netperf comparisons. hackbench, with sockets or pipes and with processes or threads, showed minor differences with some reduction in migration. tbench showed only marginal differences that were within the noise. dbench, regardless of filesystem, showed minor differences all of which are within noise. Multiple machines, both UMA and NUMA, were tested without any regressions showing up. The biggest risk with a patch like this is affecting wakeup latencies.
However, the schbench load from Facebook, which is very sensitive to wakeup latency, showed a mixed result with mostly improvements in wakeup latency:

                               4.15.0                 4.15.0
                                16rc0          noequal-v1r23
Lat 50.00th-qrtle-1        38.00 (   0.00%)       38.00 (   0.00%)
Lat 75.00th-qrtle-1        49.00 (   0.00%)       41.00 (  16.33%)
Lat 90.00th-qrtle-1        52.00 (   0.00%)       50.00 (   3.85%)
Lat 95.00th-qrtle-1        54.00 (   0.00%)       51.00 (   5.56%)
Lat 99.00th-qrtle-1        63.00 (   0.00%)       60.00 (   4.76%)
Lat 99.50th-qrtle-1        66.00 (   0.00%)       61.00 (   7.58%)
Lat 99.90th-qrtle-1        78.00 (   0.00%)       65.00 (  16.67%)
Lat 50.00th-qrtle-2        38.00 (   0.00%)       38.00 (   0.00%)
Lat 75.00th-qrtle-2        42.00 (   0.00%)       43.00 (  -2.38%)
Lat 90.00th-qrtle-2        46.00 (   0.00%)       48.00 (  -4.35%)
Lat 95.00th-qrtle-2        49.00 (   0.00%)       50.00 (  -2.04%)
Lat 99.00th-qrtle-2        55.00 (   0.00%)       57.00 (  -3.64%)
Lat 99.50th-qrtle-2        58.00 (   0.00%)       60.00 (  -3.45%)
Lat 99.90th-qrtle-2        65.00 (   0.00%)       68.00 (  -4.62%)
Lat 50.00th-qrtle-4        41.00 (   0.00%)       41.00 (   0.00%)
Lat 75.00th-qrtle-4        45.00 (   0.00%)       46.00 (  -2.22%)
Lat 90.00th-qrtle-4        50.00 (   0.00%)       50.00 (   0.00%)
Lat 95.00th-qrtle-4        54.00 (   0.00%)       53.00 (   1.85%)
Lat 99.00th-qrtle-4        61.00 (   0.00%)       61.00 (   0.00%)
Lat 99.50th-qrtle-4        65.00 (   0.00%)       64.00 (   1.54%)
Lat 99.90th-qrtle-4        76.00 (   0.00%)       82.00 (  -7.89%)
Lat 50.00th-qrtle-8        48.00 (   0.00%)       46.00 (   4.17%)
Lat 75.00th-qrtle-8        55.00 (   0.00%)       54.00 (   1.82%)
Lat 90.00th-qrtle-8        60.00 (   0.00%)       59.00 (   1.67%)
Lat 95.00th-qrtle-8        63.00 (   0.00%)       63.00 (   0.00%)
Lat 99.00th-qrtle-8        71.00 (   0.00%)       69.00 (   2.82%)
Lat 99.50th-qrtle-8        74.00 (   0.00%)       73.00 (   1.35%)
Lat 99.90th-qrtle-8        98.00 (   0.00%)       90.00 (   8.16%)
Lat 50.00th-qrtle-16       56.00 (   0.00%)       55.00 (   1.79%)
Lat 75.00th-qrtle-16       68.00 (   0.00%)       67.00 (   1.47%)
Lat 90.00th-qrtle-16       77.00 (   0.00%)       78.00 (  -1.30%)
Lat 95.00th-qrtle-16       82.00 (   0.00%)       84.00 (  -2.44%)
Lat 99.00th-qrtle-16       90.00 (   0.00%)       93.00 (  -3.33%)
Lat 99.50th-qrtle-16       93.00 (   0.00%)       97.00 (  -4.30%)
Lat 99.90th-qrtle-16      110.00 (   0.00%)      110.00 (   0.00%)
Lat 50.00th-qrtle-32       68.00 (   0.00%)       62.00 (   8.82%)
Lat 75.00th-qrtle-32       90.00 (   0.00%)       83.00 (   7.78%)
Lat 90.00th-qrtle-32      110.00 (   0.00%)      100.00 (   9.09%)
Lat 95.00th-qrtle-32      122.00 (   0.00%)      111.00 (   9.02%)
Lat 99.00th-qrtle-32      145.00 (   0.00%)      133.00 (   8.28%)
Lat 99.50th-qrtle-32      154.00 (   0.00%)      143.00 (   7.14%)
Lat 99.90th-qrtle-32     2316.00 (   0.00%)      515.00 (  77.76%)
Lat 50.00th-qrtle-35       69.00 (   0.00%)       72.00 (  -4.35%)
Lat 75.00th-qrtle-35       92.00 (   0.00%)       95.00 (  -3.26%)
Lat 90.00th-qrtle-35      111.00 (   0.00%)      114.00 (  -2.70%)
Lat 95.00th-qrtle-35      122.00 (   0.00%)      124.00 (  -1.64%)
Lat 99.00th-qrtle-35      142.00 (   0.00%)      144.00 (  -1.41%)
Lat 99.50th-qrtle-35      150.00 (   0.00%)      154.00 (  -2.67%)
Lat 99.90th-qrtle-35     6104.00 (   0.00%)     5640.00 (   7.60%)

Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-4-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae3e6f877711..a07920f3a2fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5747,7 +5747,16 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load ?
this_cpu : nr_cpumask_bits; + /* + * If sync, adjust the weight of prev_eff_load such that if + * prev_eff == this_eff, select_idle_sibling() will consider + * stacking the wakee on top of the waker if no other CPU is + * idle. + */ + if (sync) + prev_eff_load += 1; + + return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, -- cgit v1.2.3-71-gd317 From 24d0c1d6e65f635b2c0684d0a42ff6c0674aa0e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Feb 2018 13:37:28 +0000 Subject: sched/fair: Do not migrate due to a sync wakeup on exit When a task exits, it notifies the parent that it has exited. This is a sync wakeup and the exiting task may pull the parent towards the waker's CPU. For simple workloads like using a shell, it was observed that the shell is pulled across nodes by exiting processes. This is daft as the parent may be long-lived and properly placed. This patch special cases a sync wakeup on exit to avoid pulling tasks across nodes. Testing on a range of workloads and machines showed very little difference in performance although there was a small 3% boost on some machines running a shellscript-intensive workload (git regression test suite). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-5-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a07920f3a2fd..302dda81e192 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6350,7 +6350,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = wake_flags & WF_SYNC; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); -- cgit v1.2.3-71-gd317 From 2c83362734dad8e48ccc0710b5cd2436a0323893 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 13 Feb 2018 13:37:29 +0000 Subject: sched/fair: Consider SD_NUMA when selecting the most idle group to schedule on find_idlest_group() compares a local group with each other group to select the one that is most idle. When comparing groups in different NUMA domains, a very slight imbalance is enough to select a remote NUMA node even if the runnable load on both groups is 0 or close to 0. This ignores the cost of remote accesses entirely, which is a problem when selecting the CPU for a newly forked task to run on: a forking server is almost guaranteed to run on a remote node, incurring numerous remote accesses and potentially causing automatic NUMA balancing to try to migrate the task back or migrate the data to another node. Similar weirdness is observed if a basic shell command pipes output to another as each process in the pipeline is likely to start on a different node and then get adjusted later by wake_affine(). This patch adds imbalance to remote domains when considering whether to select CPUs from remote domains. If the local domain is selected, imbalance will still be used to try to select a CPU from a lower scheduler domain's group instead of stacking tasks on the same CPU.
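To make the new comparison concrete, the following stand-alone sketch models the added check; the helper name and free-standing form are invented for illustration, and the real logic lives inside find_idlest_group() as shown in the diff further below:

/*
 * Illustrative model of the cross-NUMA group comparison described
 * above (hypothetical helper, not the kernel code). The remote
 * candidate only wins if it beats the local group by more than the
 * allowed imbalance.
 */
static int numa_prefer_local_group(unsigned long this_runnable_load,
				   unsigned long min_runnable_load,
				   unsigned long imbalance,
				   int crosses_numa_domain)
{
	/*
	 * Bias the remote group by the imbalance so that a near-idle
	 * local group is not abandoned for an equally near-idle remote
	 * group, which would incur remote memory accesses.
	 */
	if (crosses_numa_domain &&
	    min_runnable_load + imbalance >= this_runnable_load)
		return 1;	/* stay local */

	return 0;		/* remote group is sufficiently less loaded */
}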
A variety of workloads and machines were tested and as expected, there is no difference on UMA. The difference on NUMA can be dramatic. This is a comparison of elapsed times running the git regression test suite. It's fork-intensive with short-lived processes:

                               4.15.0                 4.15.0
                         noexit-v1r23           sdnuma-v1r23
Elapsed min           1706.06 (   0.00%)     1435.94 (  15.83%)
Elapsed mean          1709.53 (   0.00%)     1436.98 (  15.94%)
Elapsed stddev           2.16 (   0.00%)        1.01 (  53.38%)
Elapsed coeffvar         0.13 (   0.00%)        0.07 (  44.54%)
Elapsed max           1711.59 (   0.00%)     1438.01 (  15.98%)

                   4.15.0        4.15.0
             noexit-v1r23  sdnuma-v1r23
User              5434.12       5188.41
System            4878.77       3467.09
Elapsed          10259.06       8624.21

That shows a considerable reduction in elapsed times. It's important to note that automatic NUMA balancing does not affect this load as processes are too short-lived. There is also a noticeable impact on hackbench such as this example using processes and pipes:

hackbench-process-pipes
                         4.15.0                 4.15.0
                   noexit-v1r23           sdnuma-v1r23
Amean     1       1.0973 (   0.00%)      0.9393 (  14.40%)
Amean     4       1.3427 (   0.00%)      1.3730 (  -2.26%)
Amean     7       1.4233 (   0.00%)      1.6670 ( -17.12%)
Amean     12      3.0250 (   0.00%)      3.3013 (  -9.13%)
Amean     21      9.0860 (   0.00%)      9.5343 (  -4.93%)
Amean     30     14.6547 (   0.00%)     13.2433 (   9.63%)
Amean     48     22.5447 (   0.00%)     20.4303 (   9.38%)
Amean     79     29.2010 (   0.00%)     26.7853 (   8.27%)
Amean     110    36.7443 (   0.00%)     35.8453 (   2.45%)
Amean     141    45.8533 (   0.00%)     42.6223 (   7.05%)
Amean     172    55.1317 (   0.00%)     50.6473 (   8.13%)
Amean     203    64.4420 (   0.00%)     58.3957 (   9.38%)
Amean     234    73.2293 (   0.00%)     67.1047 (   8.36%)
Amean     265    80.5220 (   0.00%)     75.7330 (   5.95%)
Amean     296    88.7567 (   0.00%)     82.1533 (   7.44%)

It's not a universal win as there are occasions when spreading wide and quickly is a benefit but it's more of a win than it is a loss. For other workloads, there is little difference but netperf is interesting. Without the patch, the server and client start on different nodes but quickly get migrated due to wake_affine. Hence, the difference in overall performance is marginal but detectable:

                            4.15.0                 4.15.0
                      noexit-v1r23           sdnuma-v1r23
Hmean     send-64          349.09 (   0.00%)      354.67 (   1.60%)
Hmean     send-128         699.16 (   0.00%)      702.91 (   0.54%)
Hmean     send-256        1316.34 (   0.00%)     1350.07 (   2.56%)
Hmean     send-1024       5063.99 (   0.00%)     5124.38 (   1.19%)
Hmean     send-2048       9705.19 (   0.00%)     9687.44 (  -0.18%)
Hmean     send-3312      14359.48 (   0.00%)    14577.64 (   1.52%)
Hmean     send-4096      16324.20 (   0.00%)    16393.62 (   0.43%)
Hmean     send-8192      26112.61 (   0.00%)    26877.26 (   2.93%)
Hmean     send-16384     37208.44 (   0.00%)    38683.43 (   3.96%)
Hmean     recv-64          349.09 (   0.00%)      354.67 (   1.60%)
Hmean     recv-128         699.16 (   0.00%)      702.91 (   0.54%)
Hmean     recv-256        1316.34 (   0.00%)     1350.07 (   2.56%)
Hmean     recv-1024       5063.99 (   0.00%)     5124.38 (   1.19%)
Hmean     recv-2048       9705.16 (   0.00%)     9687.43 (  -0.18%)
Hmean     recv-3312      14359.42 (   0.00%)    14577.59 (   1.52%)
Hmean     recv-4096      16323.98 (   0.00%)    16393.55 (   0.43%)
Hmean     recv-8192      26111.85 (   0.00%)    26876.96 (   2.93%)
Hmean     recv-16384     37206.99 (   0.00%)    38682.41 (   3.97%)

However, what is very interesting is how automatic NUMA balancing behaves. Each netperf instance runs long enough for balancing to activate:

NUMA base PTE updates        4620       1473
NUMA huge PMD updates           0          0
NUMA page range updates      4620       1473
NUMA hint faults             4301       1383
NUMA hint local faults       1309        451
NUMA hint local percent        30         32
NUMA pages migrated          1335        491
AutoNUMA cost                 21%         6%

There is an unfortunate number of remote faults although tracing indicated that the vast majority are in shared libraries.
However, the tendency to start tasks on the same node if there is capacity means that there were far fewer PTE updates and faults incurred overall. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-6-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 302dda81e192..94aea5b91a96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5911,6 +5911,18 @@ skip_spare: if (!idlest) return NULL; + /* + * When comparing groups across NUMA domains, it's possible for the + * local domain to be very lightly loaded relative to the remote + * domains but "imbalance" skews the comparison making remote CPUs + * look much more favourable. When considering cross-domain, add + * imbalance to the runnable load on the remote node and consider + * staying local. + */ + if ((sd->flags & SD_NUMA) && + min_runnable_load + imbalance >= this_runnable_load) + return NULL; + if (min_runnable_load > (this_runnable_load + imbalance)) return NULL; -- cgit v1.2.3-71-gd317 From 7347fc87dfe6b7315e74310ee1243dc222c68086 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 13 Feb 2018 13:37:30 +0000 Subject: sched/numa: Delay retrying placement for automatic NUMA balance after wake_affine() If wake_affine() pulls a task to another node for any reason and the node is no longer preferred then temporarily stop automatic NUMA balancing from pulling the task back. Otherwise, tasks with a strong waker/wakee relationship may constantly fight automatic NUMA balancing over where a task should be placed. Once again netperf is interesting here. The performance barely changes but the automatic NUMA balancing statistics are revealing:

Hmean     send-64          354.67 (   0.00%)      352.15 (  -0.71%)
Hmean     send-128         702.91 (   0.00%)      693.84 (  -1.29%)
Hmean     send-256        1350.07 (   0.00%)     1344.19 (  -0.44%)
Hmean     send-1024       5124.38 (   0.00%)     4941.24 (  -3.57%)
Hmean     send-2048       9687.44 (   0.00%)     9624.45 (  -0.65%)
Hmean     send-3312      14577.64 (   0.00%)    14514.35 (  -0.43%)
Hmean     send-4096      16393.62 (   0.00%)    16488.30 (   0.58%)
Hmean     send-8192      26877.26 (   0.00%)    26431.63 (  -1.66%)
Hmean     send-16384     38683.43 (   0.00%)    38264.91 (  -1.08%)
Hmean     recv-64          354.67 (   0.00%)      352.15 (  -0.71%)
Hmean     recv-128         702.91 (   0.00%)      693.84 (  -1.29%)
Hmean     recv-256        1350.07 (   0.00%)     1344.19 (  -0.44%)
Hmean     recv-1024       5124.38 (   0.00%)     4941.24 (  -3.57%)
Hmean     recv-2048       9687.43 (   0.00%)     9624.45 (  -0.65%)
Hmean     recv-3312      14577.59 (   0.00%)    14514.35 (  -0.43%)
Hmean     recv-4096      16393.55 (   0.00%)    16488.20 (   0.58%)
Hmean     recv-8192      26876.96 (   0.00%)    26431.29 (  -1.66%)
Hmean     recv-16384     38682.41 (   0.00%)    38263.94 (  -1.08%)

NUMA alloc hit              1465986    1423090
NUMA alloc miss                   0          0
NUMA interleave hit               0          0
NUMA alloc local            1465897    1423003
NUMA base PTE updates          1473       1420
NUMA huge PMD updates             0          0
NUMA page range updates        1473       1420
NUMA hint faults               1383       1312
NUMA hint local faults          451        124
NUMA hint local percent          32          9

There is a slight degradation in performance but slightly fewer NUMA faults. There is a large drop in the percentage of local faults but the bulk of migrations for netperf are in small shared libraries so it's reflecting the fact that automatic NUMA balancing has backed off.
This is a case where, despite wake_affine() and automatic NUMA balancing fighting over placement, there is a marginal benefit to rescheduling to local data quickly. However, it should be noted that wake_affine() and automatic NUMA balancing fighting each other constantly is undesirable. The benefit in other cases, though, is large. This is the result for NAS with the D class sizing on a 4-socket machine:

nas-mpi
                       4.15.0                 4.15.0
                 sdnuma-v1r23       delayretry-v1r23
Time cg.D       557.00 (   0.00%)     431.82 (  22.47%)
Time ep.D        77.83 (   0.00%)      79.01 (  -1.52%)
Time is.D        26.46 (   0.00%)      26.64 (  -0.68%)
Time lu.D       727.14 (   0.00%)     597.94 (  17.77%)
Time mg.D       191.35 (   0.00%)     146.85 (  23.26%)

                          4.15.0           4.15.0
                    sdnuma-v1r23 delayretry-v1r23
User                    75665.20         70413.30
System                  20321.59          8861.67
Elapsed                   766.13           634.92
Minor Faults            16528502          7127941
Major Faults                4553             5068
NUMA alloc local         6963197          6749135
NUMA base PTE updates  366409093        107491434
NUMA huge PMD updates     687556           198880
NUMA page range updates 718437765       209317994
NUMA hint faults        13643410          4601187
NUMA hint local faults   9212593          3063996
NUMA hint local percent       67               66

Note the massive reduction in system CPU usage even though the percentage of local faults is barely affected. There is a massive reduction in the number of PTE updates showing that automatic NUMA balancing has backed off. A critical observation is also that there is a massive reduction in minor faults which is due to far fewer NUMA hinting faults being trapped. There were questions about NAS OMP and how it behaves when threads are bound to CPUs. First, there are more gains than losses with this patch applied and a reduction in system CPU usage:

nas-omp
                    4.16.0-rc1             4.16.0-rc1
                   sdnuma-v2r1        delayretry-v2r1
Time bt.D       436.71 (   0.00%)     430.05 (   1.53%)
Time cg.D       201.02 (   0.00%)     180.87 (  10.02%)
Time ep.D        32.84 (   0.00%)      32.68 (   0.49%)
Time is.D         9.63 (   0.00%)       9.64 (  -0.10%)
Time lu.D       331.20 (   0.00%)     304.80 (   7.97%)
Time mg.D        54.87 (   0.00%)      52.72 (   3.92%)
Time sp.D      1108.78 (   0.00%)     917.10 (  17.29%)
Time ua.D       378.81 (   0.00%)     398.83 (  -5.28%)

               4.16.0-rc1      4.16.0-rc1
              sdnuma-v2r1 delayretry-v2r1
User            305633.08       296751.91
System             451.75          357.80
Elapsed           2595.73         2368.13

However, it does not close the gap between binding and being unbound. There is negligible difference between the performance of the baseline and a patched kernel when threads are bound so it is not presented here; instead, the bound and unbound results for the patched kernel are compared:

                    4.16.0-rc1             4.16.0-rc1
               delayretry-bind     delayretry-unbound
Time bt.D       385.02 (   0.00%)     430.05 ( -11.70%)
Time cg.D       144.02 (   0.00%)     180.87 ( -25.59%)
Time ep.D        32.85 (   0.00%)      32.68 (   0.52%)
Time is.D        10.52 (   0.00%)       9.64 (   8.37%)
Time lu.D       285.31 (   0.00%)     304.80 (  -6.83%)
Time mg.D        43.21 (   0.00%)      52.72 ( -22.01%)
Time sp.D       820.24 (   0.00%)     917.10 ( -11.81%)
Time ua.D       337.09 (   0.00%)     398.83 ( -18.32%)

                4.16.0-rc1         4.16.0-rc1
           delayretry-bind delayretry-unbound
User             277731.25          296751.91
System              261.29             357.80
Elapsed            2100.55            2368.13

Unfortunately, while performance is improved by the patch, there is still quite a long way to go before it's equivalent to hard binding. Other workloads like hackbench, tbench, dbench and schbench are barely affected. dbench shows a mix of gains and losses depending on the machine although in general, the results are more stable.
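The waker/wakee backoff can be sketched in a simplified stand-alone form; the names below are invented for illustration, the model deliberately ignores jiffies wraparound, and the real implementation appears in the diff that follows:

/*
 * Simplified model (hypothetical, not the kernel code) of the handshake
 * between an affine wakeup and the periodic NUMA placement retry.
 */
static unsigned long numa_retry_deadline;	/* models p->numa_migrate_retry */

/* After wake_affine() moves the wakee, defer placement ~4 scan periods */
static void wake_affine_backoff(unsigned long now, unsigned long scan_period)
{
	numa_retry_deadline = now + (scan_period << 2);
}

/* The periodic retry path stays backed off while that deadline is ahead */
static int numa_placement_may_retry(unsigned long now, unsigned long interval)
{
	if (now + interval <= numa_retry_deadline)
		return 0;	/* wake_affine asked us to back off */

	numa_retry_deadline = now + interval;
	return 1;
}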
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180213133730.24064-7-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94aea5b91a96..33662a3bdc6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1869,6 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; + unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1876,7 +1877,18 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - p->numa_migrate_retry = jiffies + interval; + numa_migrate_retry = jiffies + interval; + + /* + * Check that the new retry threshold is after the current one. If + * the retry is in the future, it implies that wake_affine has + * temporarily asked NUMA balancing to back off from placement. + */ + if (numa_migrate_retry > p->numa_migrate_retry) + return; + + /* Safe to try placing the task on the preferred node */ + p->numa_migrate_retry = numa_migrate_retry; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -5759,6 +5771,48 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } +#ifdef CONFIG_NUMA_BALANCING +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ + unsigned long interval; + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + /* If balancing has no preference then continue gathering data */ + if (p->numa_preferred_nid == -1) + return; + + /* + * If the wakeup is not affecting locality then it is neutral from + * the perspective of NUMA balancing so continue gathering data. + */ + if (cpu_to_node(prev_cpu) == cpu_to_node(target)) + return; + + /* + * Temporarily prevent NUMA balancing from trying to place waker/wakee after + * wakee has been moved by wake_affine. This will potentially allow + * related tasks to converge and update their data placement. The + * 4 * numa_scan_period is to allow the two-pass filter to migrate + * hot data to the waker's node.
+ */ + interval = max(sysctl_numa_balancing_scan_delay, + p->numa_scan_period << 2); + p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); + + interval = max(sysctl_numa_balancing_scan_delay, + current->numa_scan_period << 2); + current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); +} +#else +static void +update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) +{ +} +#endif + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5774,6 +5828,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; + update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; -- cgit v1.2.3-71-gd317 From 77a021be383ebdacb17594e280242f7fd116095f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:23 +0100 Subject: sched/core: Rename init_rq_hrtick() to hrtick_rq_init() Do that rename in order to normalize the hrtick namespace. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7c535eee0a6..e72ca3c574fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -333,7 +333,7 @@ void hrtick_start(struct rq *rq, u64 delay) } #endif /* CONFIG_SMP */ -static void init_rq_hrtick(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP rq->hrtick_csd_pending = 0; @@ -351,7 +351,7 @@ static inline void hrtick_clear(struct rq *rq) { } -static inline void init_rq_hrtick(struct rq *rq) +static inline void hrtick_rq_init(struct rq *rq) { } #endif /* CONFIG_SCHED_HRTICK */ @@ -6028,7 +6028,7 @@ void __init sched_init(void) rq->last_sched_tick = 0; #endif #endif /* CONFIG_SMP */ - init_rq_hrtick(rq); + hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); } -- cgit v1.2.3-71-gd317 From a364298359e74a414857bbbf3b725564feb22d09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:24 +0100 Subject: nohz: Convert tick_nohz_tick_stopped() to bool It makes this function more self-explanatory about what it does and how to use it. Reported-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/tick.h | 2 +- kernel/time/tick-sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 7cc35921218e..86576d9d2311 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -113,7 +113,7 @@ enum tick_dep_bits { #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; -extern int tick_nohz_tick_stopped(void); +extern bool tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..0aba0412ede5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -481,7 +481,7 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); -int tick_nohz_tick_stopped(void) +bool tick_nohz_tick_stopped(void) { return __this_cpu_read(tick_cpu_sched.tick_stopped); } -- cgit v1.2.3-71-gd317 From 22ab8bc02a5f6e8ffc418759894f7a6b0b632331 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:25 +0100 Subject: nohz: Allow to check if remote CPU tick is stopped This check is racy but provides a good heuristic to determine whether a CPU may need a remote tick or not. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-4-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/tick.h | 2 ++ kernel/time/tick-sched.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 86576d9d2311..7f8c9a127f5a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -114,6 +114,7 @@ enum tick_dep_bits { #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); +extern bool tick_nohz_tick_stopped_cpu(int cpu); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); @@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } +static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0aba0412ede5..d479b21a848b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,6 +486,13 @@ bool tick_nohz_tick_stopped(void) return __this_cpu_read(tick_cpu_sched.tick_stopped); } +bool tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * -- cgit v1.2.3-71-gd317 From 1bda3f8087fce9063da0b8aef87f17a3fe541aca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:26 +0100 Subject: sched/isolation: Isolate workqueues when "nohz_full=" is set As we prepare for offloading the residual 1hz scheduler ticks 
to a workqueue, let's affine those to housekeepers so that they don't interrupt the CPUs that don't want to be disturbed. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-5-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/isolation.h | 1 + kernel/sched/isolation.c | 3 ++- kernel/workqueue.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index d849431c8060..4a6582c27dea 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -12,6 +12,7 @@ enum hk_flags { HK_FLAG_SCHED = (1 << 3), HK_FLAG_TICK = (1 << 4), HK_FLAG_DOMAIN = (1 << 5), + HK_FLAG_WQ = (1 << 6), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..a2500c459617 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -3,6 +3,7 @@ * any CPU: unbound workqueues, timers, kthreads and any offloadable work. * * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ @@ -119,7 +120,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; return housekeeping_setup(str, flags); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 017044c26233..593dbe749174 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5565,12 +5565,13 @@ static void __init wq_numa_init(void) int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3-71-gd317 From d84b31313ef8a8de55a2cbfb72f76f36d8c927fb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:27 +0100 Subject: sched/isolation: Offload residual 1Hz scheduler tick When a CPU runs in full dynticks mode, a 1Hz tick remains in order to keep the scheduler stats alive. However, this residual tick is a burden for bare metal tasks that can't stand any interruption at all, or want to minimize them. The usual boot parameters "nohz_full=" or "isolcpus=nohz" will now outsource these scheduler ticks to the global workqueue so that a housekeeping CPU handles those remotely. The sched_class::task_tick() implementations have been audited and look safe to be called remotely as the target runqueue and its current task are passed in as parameters and don't seem to be accessed locally. Note that in the case of using isolcpus, it's still up to the user to affine the global workqueues to the housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or domains isolation "isolcpus=nohz,domain".
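The offload in the diff below boils down to a well-known pattern: a per-CPU delayed work item on an unbound (and therefore housekeeping-affine) workqueue that re-arms itself once per second. A generic sketch of that pattern, with invented names and the actual remote work elided:

#include <linux/workqueue.h>

struct remote_poll {
	int cpu;
	struct delayed_work work;
};

static void remote_poll_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct remote_poll *rp = container_of(dwork, struct remote_poll, work);

	/* ... inspect CPU rp->cpu remotely, taking its locks as needed ... */

	/* Re-arm at 1Hz; system_unbound_wq runs on housekeeping CPUs once
	 * workqueue_init_early() honours HK_FLAG_WQ as above. */
	queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void remote_poll_start(struct remote_poll *rp, int cpu)
{
	rp->cpu = cpu;
	INIT_DELAYED_WORK(&rp->work, remote_poll_fn);
	queue_delayed_work(system_unbound_wq, &rp->work, HZ);
}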
Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-6-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/deadline.c | 8 +++++ kernel/sched/fair.c | 7 +++- kernel/sched/idle_task.c | 8 +++++ kernel/sched/isolation.c | 4 +++ kernel/sched/rt.c | 8 +++++ kernel/sched/sched.h | 2 ++ kernel/sched/stop_task.c | 8 +++++ 8 files changed, 136 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e72ca3c574fc..5dfef458ab52 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3125,6 +3125,96 @@ u64 scheduler_tick_max_deferment(void) return jiffies_to_nsecs(next - now); } + +struct tick_work { + int cpu; + struct delayed_work work; +}; + +static struct tick_work __percpu *tick_work_cpu; + +static void sched_tick_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tick_work *twork = container_of(dwork, struct tick_work, work); + int cpu = twork->cpu; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + /* + * Handle the tick only if it appears the remote CPU is running in full + * dynticks mode. The check is racy by nature, but missing a tick or + * having one too much is no big deal because the scheduler tick updates + * statistics and checks timeslices in a time-independent way, regardless + * of when exactly it is running. + */ + if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { + struct task_struct *curr; + u64 delta; + + rq_lock_irq(rq, &rf); + update_rq_clock(rq); + curr = rq->curr; + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + rq_unlock_irq(rq, &rf); + } + + /* + * Run the remote tick once per second (1Hz). This arbitrary + * frequency is large enough to avoid overload but short enough + * to keep scheduler internal stats reasonably up to date. 
+ */ + queue_delayed_work(system_unbound_wq, dwork, HZ); +} + +static void sched_tick_start(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sched_tick_stop(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + cancel_delayed_work_sync(&twork->work); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __init sched_tick_offload_init(void) +{ + tick_work_cpu = alloc_percpu(struct tick_work); + BUG_ON(!tick_work_cpu); + + return 0; +} + +#else /* !CONFIG_NO_HZ_FULL */ +static inline void sched_tick_start(int cpu) { } +static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -5786,6 +5876,7 @@ int sched_cpu_starting(unsigned int cpu) { set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); + sched_tick_start(cpu); return 0; } @@ -5797,6 +5888,7 @@ int sched_cpu_dying(unsigned int cpu) /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); + sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); if (rq->rd) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df09782025c..65cd5ead1759 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1776,6 +1776,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) enqueue_pushable_dl_task(rq, p); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33662a3bdc6d..e1febd252a84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9515,7 +9515,12 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ /* - * scheduler tick hitting a task of our scheduling class: + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d518664cce4f..e1b46e08c8e1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -51,6 +51,14 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) rq_last_tick_reset(rq); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index a2500c459617..39f340dde1d7 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -13,6 +13,7 @@ #include #include #include +#include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); @@ -61,6 +62,9 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overriden); + if (housekeeping_flags & HK_FLAG_TICK) + sched_tick_offload_init(); + /* We need at least one CPU to handle housekeeping work */ WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451584e..c80563b4f6b9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2292,6 +2292,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) static inline void watchdog(struct rq *rq, struct task_struct *p) { } #endif +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { struct sched_rt_entity *rt_se = &p->rt; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc458547f..c1c7c788da1c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1574,6 +1574,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); /* * Tick may be needed by tasks in the runqueue depending on their policy and @@ -1598,6 +1599,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } #else +static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } #endif diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..ea8d2b6a1239 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -75,6 +75,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) cgroup_account_cputime(curr, delta_exec); } +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -- cgit v1.2.3-71-gd317 From dcdedb24159be3487e3dbbe1faa79ae7d00c92ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Feb 2018 05:17:28 +0100 Subject: sched/nohz: Remove the 1 Hz tick code Now that the 1Hz tick is offloaded to workqueues, we can safely remove the residual code that used to handle it locally. Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1519186649-3242-7-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/nohz.h | 4 ---- kernel/sched/core.c | 29 ----------------------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 11 +---------- kernel/time/tick-sched.c | 6 ------ 5 files changed, 1 insertion(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 3d3a97d9399d..094217273ff9 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu); static inline void wake_up_nohz_cpu(int cpu) { } #endif -#ifdef CONFIG_NO_HZ_FULL -extern u64 scheduler_tick_max_deferment(void); -#endif - #endif /* _LINUX_SCHED_NOHZ_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5dfef458ab52..8fff4f16c510 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3096,35 +3096,9 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif - rq_last_tick_reset(rq); } #ifdef CONFIG_NO_HZ_FULL -/** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. - */ -u64 scheduler_tick_max_deferment(void) -{ - struct rq *rq = this_rq(); - unsigned long next, now = READ_ONCE(jiffies); - - next = rq->last_sched_tick + HZ; - - if (time_before_eq(next, now)) - return 0; - - return jiffies_to_nsecs(next - now); -} struct tick_work { int cpu; @@ -6116,9 +6090,6 @@ void __init sched_init(void) rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = 0; -#endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index e1b46e08c8e1..48b8a83f5185 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -48,7 +48,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - rq_last_tick_reset(rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c1c7c788da1c..dc6c8b5a24ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -727,9 +727,7 @@ struct rq { #endif /* CONFIG_SMP */ unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_FULL - unsigned long last_sched_tick; -#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1626,13 +1624,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -static inline void rq_last_tick_reset(struct rq *rq) -{ -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = jiffies; -#endif -} - extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d479b21a848b..f2fa2e940fe5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -748,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = KTIME_MAX; } -#ifdef CONFIG_NO_HZ_FULL - /* Limit 
the tick delta to the maximum scheduler deferment */ - if (!ts->inidle) - delta = min(delta, scheduler_tick_max_deferment()); -#endif - /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; -- cgit v1.2.3-71-gd317 From d1897c9538edafd4ae6bbd03cc075962ddde2c21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Feb 2018 11:39:22 -0800 Subject: cgroup: fix rule checking for threaded mode switching A domain cgroup isn't allowed to be turned threaded if its subtree is populated or domain controllers are enabled. cgroup_enable_threaded() depended on the cgroup_can_be_thread_root() test to enforce this rule. A parent which has populated domain descendants or has domain controllers enabled can't become a thread root, so the above rules are enforced automatically. However, for the root cgroup, which can host mixed domain and threaded children, cgroup_can_be_thread_root() doesn't check any of those conditions and thus first-level cgroups end up escaping those rules. This patch fixes the bug by adding explicit checks for those rules in cgroup_enable_threaded(). Reported-by: Michael Kerrisk (man-pages) Signed-off-by: Tejun Heo Fixes: 8cfd8147df67 ("cgroup: implement cgroup v2 thread support") Cc: stable@vger.kernel.org # v4.14+ --- kernel/cgroup/cgroup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8cda3bc3ae22..4bfb2908ec15 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) if (cgroup_is_threaded(cgrp)) return 0; + /* + * If @cgroup is populated or has domain controllers enabled, it + * can't be switched. While the below cgroup_can_be_thread_root() + * test can catch the same conditions, that's only when @parent is + * not mixable, so let's check it explicitly. + */ + if (cgroup_is_populated(cgrp) || + cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return -EOPNOTSUPP; + /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) -- cgit v1.2.3-71-gd317 From 23138ead270045f1b3e912e667967b6094244999 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 21 Feb 2018 04:30:07 -0500 Subject: audit: return on memory error to avoid null pointer dereference If there is a memory allocation error when trying to change an audit kernel feature value, the ignored allocation error will trigger a NULL pointer dereference oops on subsequent use of that pointer. Return instead. Passes audit-testsuite.
See: https://github.com/linux-audit/audit-kernel/issues/76 Signed-off-by: Richard Guy Briggs [PM: not necessary (other funcs check for NULL), but a good practice] Signed-off-by: Paul Moore --- kernel/audit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5c2544984375..2de74be7cef5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1059,6 +1059,8 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + if (!ab) + return; audit_log_task_info(ab, current); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, -- cgit v1.2.3-71-gd317 From 304ec482f562885b178b370cd50340447585d1c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 22 Feb 2018 14:28:50 -0500 Subject: get rid of pointless includes of fs_struct.h Signed-off-by: Al Viro --- arch/arc/kernel/troubleshoot.c | 1 - drivers/staging/lustre/lnet/libcfs/linux/linux-curproc.c | 1 - drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c | 1 - fs/ceph/dir.c | 1 - fs/nfs/nfs4xdr.c | 1 - fs/orangefs/acl.c | 1 - kernel/exec_domain.c | 1 - security/loadpin/loadpin.c | 1 - 8 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 6e9a0a9a6a04..783b20354f8b 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-curproc.c index 1d8949f1a4fa..91e48c42533c 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-curproc.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-curproc.c @@ -38,7 +38,6 @@ */ #include -#include #include #include diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c index 6f92ea272186..4eb9ba455e98 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c @@ -34,7 +34,6 @@ #define DEBUG_SUBSYSTEM S_LNET #include #include -#include #include #include diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0c4346806e17..364a91fb3599 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 65c9c4175145..b993ad282de2 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,7 +52,6 @@ #include #include #include -#include #include "nfs4_fs.h" #include "internal.h" diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 480ea059a680..10587413b20e 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include -#include struct posix_acl *orangefs_get_acl(struct inode *inode, int type) { diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0975b0268545..a5697119290e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -19,7 +19,6 @@ #include #include #include -#include #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index dbe6efde77a0..5fa191252c8f 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -19,7 +19,6 @@ #include 
#include -#include #include #include -- cgit v1.2.3-71-gd317 From ce423631ce1f20564f818e7de6bc0eee0c01badd Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 20 Feb 2018 09:52:38 -0500 Subject: audit: track the owner of the command mutex ourselves Evidently the __mutex_owner() function was never intended for use outside the core mutex code, so build a thin locking wrapper around the mutex code which allows us to track the mutex owner. One, arguably positive, side effect is that this allows us to hide the audit_cmd_mutex inside of kernel/audit.c behind the lock/unlock functions. Reported-by: Peter Zijlstra Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 66 ++++++++++++++++++++++++++++++++++++++++++++--------- kernel/audit.h | 3 ++- kernel/audit_tree.c | 8 +++---- 3 files changed, 61 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2de74be7cef5..1a3e75d9a66c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -181,9 +181,21 @@ static char *audit_feature_names[2] = { "loginuid_immutable", }; - -/* Serialize requests from userspace. */ -DEFINE_MUTEX(audit_cmd_mutex); +/** + * struct audit_ctl_mutex - serialize requests from userspace + * @lock: the mutex used for locking + * @owner: the task which owns the lock + * + * Description: + * This is the lock struct used to ensure we only process userspace requests + * in an orderly fashion. We can't simply use a mutex/lock here because we + * need to track lock ownership so we don't end up blocking the lock owner in + * audit_log_start() or similar. + */ +static struct audit_ctl_mutex { + struct mutex lock; + void *owner; +} audit_cmd_mutex; /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -227,6 +239,36 @@ int auditd_test_task(struct task_struct *task) return rc; } +/** + * audit_ctl_lock - Take the audit control lock + */ +void audit_ctl_lock(void) +{ + mutex_lock(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = current; +} + +/** + * audit_ctl_unlock - Drop the audit control lock + */ +void audit_ctl_unlock(void) +{ + audit_cmd_mutex.owner = NULL; + mutex_unlock(&audit_cmd_mutex.lock); +} + +/** + * audit_ctl_owner_current - Test to see if the current task owns the lock + * + * Description: + * Return true if the current task owns the audit control lock, false if it + * doesn't own the lock. + */ +static bool audit_ctl_owner_current(void) +{ + return (current == audit_cmd_mutex.owner); +} + /** * auditd_pid_vnr - Return the auditd PID relative to the namespace * @@ -861,8 +903,8 @@ int audit_send_list(void *_dest) struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(sk, skb, dest->portid, 0); @@ -903,8 +945,8 @@ static int audit_send_reply_thread(void *arg) struct audit_reply *reply = (struct audit_reply *)arg; struct sock *sk = audit_get_sk(reply->net); - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_lock(); + audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite.
*/ @@ -1469,7 +1511,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ @@ -1478,7 +1520,7 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* Run custom bind function on netlink socket group connect or bind requests. */ @@ -1550,6 +1592,9 @@ static int __init audit_init(void) for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + mutex_init(&audit_cmd_mutex.lock); + audit_cmd_mutex.owner = NULL; + pr_info("initializing netlink subsys (%s)\n", audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); @@ -1713,8 +1758,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block * while holding the mutex */ - if (!(auditd_test_task(current) || - (current == __mutex_owner(&audit_cmd_mutex)))) { + if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; while (audit_backlog_limit && diff --git a/kernel/audit.h b/kernel/audit.h index af5bc59487ed..214e14948370 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -341,4 +341,5 @@ extern struct list_head *audit_killed_trees(void); #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif -extern struct mutex audit_cmd_mutex; +extern void audit_ctl_lock(void); +extern void audit_ctl_unlock(void); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index fd353120e0d9..67e6956c0b61 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -709,7 +709,7 @@ static int prune_tree_thread(void *unused) schedule(); } - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -727,7 +727,7 @@ static int prune_tree_thread(void *unused) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } return 0; } @@ -924,7 +924,7 @@ static void audit_schedule_prune(void) */ void audit_kill_trees(struct list_head *list) { - mutex_lock(&audit_cmd_mutex); + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); while (!list_empty(list)) { @@ -942,7 +942,7 @@ void audit_kill_trees(struct list_head *list) } mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); + audit_ctl_unlock(); } /* -- cgit v1.2.3-71-gd317 From ca36960211eb228bcbc7aaebfa0d027368a94c60 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Feb 2018 22:29:05 +0100 Subject: bpf: allow xadd only on aligned memory The requirements around atomic_add() / atomic64_add() resp. their JIT implementations differ across architectures. E.g. while x86_64 seems just fine with BPF's xadd on unaligned memory, on arm64 it triggers via interpreter but also JIT the following crash: [ 830.864985] Unable to handle kernel paging request at virtual address ffff8097d7ed6703 [...] 
[ 830.916161] Internal error: Oops: 96000021 [#1] SMP [ 830.984755] CPU: 37 PID: 2788 Comm: test_verifier Not tainted 4.16.0-rc2+ #8 [ 830.991790] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.29 07/17/2017 [ 830.998998] pstate: 80400005 (Nzcv daif +PAN -UAO) [ 831.003793] pc : __ll_sc_atomic_add+0x4/0x18 [ 831.008055] lr : ___bpf_prog_run+0x1198/0x1588 [ 831.012485] sp : ffff00001ccabc20 [ 831.015786] x29: ffff00001ccabc20 x28: ffff8017d56a0f00 [ 831.021087] x27: 0000000000000001 x26: 0000000000000000 [ 831.026387] x25: 000000c168d9db98 x24: 0000000000000000 [ 831.031686] x23: ffff000008203878 x22: ffff000009488000 [ 831.036986] x21: ffff000008b14e28 x20: ffff00001ccabcb0 [ 831.042286] x19: ffff0000097b5080 x18: 0000000000000a03 [ 831.047585] x17: 0000000000000000 x16: 0000000000000000 [ 831.052885] x15: 0000ffffaeca8000 x14: 0000000000000000 [ 831.058184] x13: 0000000000000000 x12: 0000000000000000 [ 831.063484] x11: 0000000000000001 x10: 0000000000000000 [ 831.068783] x9 : 0000000000000000 x8 : 0000000000000000 [ 831.074083] x7 : 0000000000000000 x6 : 000580d428000000 [ 831.079383] x5 : 0000000000000018 x4 : 0000000000000000 [ 831.084682] x3 : ffff00001ccabcb0 x2 : 0000000000000001 [ 831.089982] x1 : ffff8097d7ed6703 x0 : 0000000000000001 [ 831.095282] Process test_verifier (pid: 2788, stack limit = 0x0000000018370044) [ 831.102577] Call trace: [ 831.105012] __ll_sc_atomic_add+0x4/0x18 [ 831.108923] __bpf_prog_run32+0x4c/0x70 [ 831.112748] bpf_test_run+0x78/0xf8 [ 831.116224] bpf_prog_test_run_xdp+0xb4/0x120 [ 831.120567] SyS_bpf+0x77c/0x1110 [ 831.123873] el0_svc_naked+0x30/0x34 [ 831.127437] Code: 97fffe97 17ffffec 00000000 f9800031 (885f7c31) Reason for this is because memory is required to be aligned. In case of BPF, we always enforce alignment in terms of stack access, but not when accessing map values or packet data when the underlying arch (e.g. arm64) has CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS set. xadd on packet data that is local to us anyway is just wrong, so forbid this case entirely. The only place where xadd makes sense in fact are map values; xadd on stack is wrong as well, but it's been around for much longer. Specifically enforce strict alignment in case of xadd, so that we handle this case generically and avoid such crashes in the first place. 
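As a minimal standalone sketch of the rule being enforced (the function name and types below are illustrative, not the verifier's; the real checks live in check_ptr_alignment() and its helpers), an xadd access is accepted only when its effective offset is a multiple of the access size:

#include <stdbool.h>
#include <stdint.h>

/* BPF_W xadd operates on 4 bytes, BPF_DW on 8. With strict alignment
 * forced for xadd, the check applies even on arches that set
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. */
static bool xadd_access_aligned(uint64_t reg_off, int32_t insn_off, uint32_t size)
{
	/* size is a power of two, so the modulo reduces to a mask */
	return ((reg_off + (int64_t)insn_off) & (uint64_t)(size - 1)) == 0;
}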
Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 42 +++++++++++++-------- tools/testing/selftests/bpf/test_verifier.c | 58 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fb69a85d967..c6eff108aa99 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1356,6 +1356,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_CTX; } +static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return type_is_pkt_pointer(reg->type); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1416,10 +1423,10 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, } static int check_ptr_alignment(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) + const struct bpf_reg_state *reg, int off, + int size, bool strict_alignment_once) { - bool strict = env->strict_alignment; + bool strict = env->strict_alignment || strict_alignment_once; const char *pointer_desc = ""; switch (reg->type) { @@ -1576,9 +1583,9 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, - int bpf_size, enum bpf_access_type t, - int value_regno) +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, + int off, int bpf_size, enum bpf_access_type t, + int value_regno, bool strict_alignment_once) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -1590,7 +1597,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return size; /* alignment checks will add in reg->off themselves */ - err = check_ptr_alignment(env, reg, off, size); + err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; @@ -1735,21 +1742,23 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d context is not allowed\n", - insn->dst_reg); + if (is_ctx_reg(env, insn->dst_reg) || + is_pkt_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? + "context" : "packet"); return -EACCES; } /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1); + BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; /* check whether atomic_add can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1); + BPF_SIZE(insn->code), BPF_WRITE, -1, true); } /* when register 'regno' is passed into function that will read 'access_size' @@ -2388,7 +2397,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + BPF_WRITE, -1, false); if (err) return err; } @@ -4632,7 +4642,7 @@ static int do_check(struct bpf_verifier_env *env) */ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg); + insn->dst_reg, false); if (err) return err; @@ -4684,7 +4694,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg); + insn->src_reg, false); if (err) return err; @@ -4719,7 +4729,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - -1); + -1, false); if (err) return err; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index c73592fa3d41..437c0b1c9d21 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -11163,6 +11163,64 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "xadd/w check unaligned stack", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "misaligned stack access off", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "xadd/w check unaligned map", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = REJECT, + .errstr = "misaligned value access off", + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "xadd/w check unaligned pkt", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_0, 99), + BPF_JMP_IMM(BPF_JA, 0, 0, 6), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), + BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), + BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), + BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "BPF_XADD stores into R2 packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3-71-gd317 From ad7c946b35ad455417fdd4bc0e17deda4011841b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 8 Jan 2018 14:35:52 -0800 Subject: rcu: Create RCU-specific workqueues with rescuers RCU's expedited grace periods can participate in out-of-memory deadlocks due to all available system_wq kthreads being blocked and there not being memory available to create more. This commit prevents such deadlocks by allocating an RCU-specific workqueue_struct at early boot time, and providing it with a rescuer to ensure forward progress. This uses the shiny new init_rescuer() function provided by Tejun (but indirectly). This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. Note that this moves SRCU from system_power_efficient_wq to a normal workqueue. In the unlikely event that this results in measurable degradation, a separate power-efficient workqueue will be created for SRCU. Reported-by: Prateek Sood Reported-by: Tejun Heo Signed-off-by: Paul E. McKenney Acked-by: Tejun Heo --- kernel/rcu/rcu.h | 1 + kernel/rcu/srcutree.c | 8 +++----- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_exp.h | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1c868bcfd705..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -485,6 +485,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 045b559b9f22..743d18379256 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, - &sdp->work, delay); + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); } /* @@ -691,8 +690,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, - srcu_get_delay(sp)); + queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -1225,7 +1223,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 99d404c6bbbb..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4167,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_wq; + void __init rcu_init(void) { int cpu; @@ -4193,6 +4195,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU.
*/ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6ad87642f44a..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -626,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ -- cgit v1.2.3-71-gd317 From 7f852afe448c95691ead6b57bae5f37562d060b5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 17 Jan 2018 14:01:28 +0800 Subject: clocksource: Don't walk the clocksource list for empty override If the override clocksource name is empty there is no point in walking the clocksource list for a match. Signed-off-by: Baolin Wang Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: sboyd@codeaurora.org Cc: broonie@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/069ce2a605546bcad6552968cff755f0a03f9f10.1516167691.git.baolin.wang@linaro.org --- kernel/time/clocksource.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 65f9e3f24dde..c5fdcb13200f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -594,6 +594,9 @@ static void __clocksource_select(bool skipcur) if (!best) return; + if (!strlen(override_name)) + goto found; + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { if (skipcur && cs == curr_clocksource) @@ -625,6 +628,7 @@ static void __clocksource_select(bool skipcur) break; } +found: if (curr_clocksource != best && !timekeeping_notify(best)) { pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; -- cgit v1.2.3-71-gd317 From e87821d18cf4db19d634a04061c0a1b7eb9c0e65 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 17 Jan 2018 14:01:29 +0800 Subject: clocksource: Use DEVICE_ATTR_RW/RO/WO to define device attributes Convert DEVICE_ATTR to DEVICE_ATTR_RW/RO/WO which is the preferred and simpler way of implementation. Signed-off-by: Baolin Wang Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: sboyd@codeaurora.org Cc: broonie@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/8f35c77e753e957b61187e8e7b2e4a3d61e4a72b.1516167691.git.baolin.wang@linaro.org --- kernel/time/clocksource.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c5fdcb13200f..7ce53465782b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -857,16 +857,16 @@ EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** - * sysfs_show_current_clocksources - sysfs interface for current clocksource + * current_clocksource_show - sysfs interface for current clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. 
*/ -static ssize_t -sysfs_show_current_clocksources(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { ssize_t count = 0; @@ -895,7 +895,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) } /** - * sysfs_override_clocksource - interface for manually overriding clocksource + * current_clocksource_store - interface for manually overriding clocksource * @dev: unused * @attr: unused * @buf: name of override clocksource @@ -904,9 +904,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { ssize_t ret; @@ -920,9 +920,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +static DEVICE_ATTR_RW(current_clocksource); /** - * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * unbind_clocksource_store - interface for manually unbinding clocksource * @dev: unused * @attr: unused * @buf: unused @@ -930,7 +931,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, * * Takes input from sysfs interface for manually unbinding a clocksource. */ -static ssize_t sysfs_unbind_clocksource(struct device *dev, +static ssize_t unbind_clocksource_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -954,19 +955,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, return ret ? ret : count; } +static DEVICE_ATTR_WO(unbind_clocksource); /** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * available_clocksource_show - sysfs interface for listing clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ -static ssize_t -sysfs_show_available_clocksources(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; @@ -990,17 +991,7 @@ sysfs_show_available_clocksources(struct device *dev, return count; } - -/* - * Sysfs setup bits: - */ -static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); - -static DEVICE_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); +static DEVICE_ATTR_RO(available_clocksource); static struct bus_type clocksource_subsys = { .name = "clocksource", -- cgit v1.2.3-71-gd317 From 27263e8dc0f6fe27540a843611ec14a000591c41 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 17 Jan 2018 14:01:30 +0800 Subject: clocksource: Use ATTRIBUTE_GROUPS Use ATTRIBUTE_GROUPS instead of manually creating the individual device files. 
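The shape of the conversion, as a minimal sketch built around a hypothetical example_* attribute (the names are made up; only the pattern matches the patch that follows):

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "42\n");
}
static DEVICE_ATTR_RO(example);

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL
};
ATTRIBUTE_GROUPS(example);	/* emits example_groups[] */

/* Assigning ->groups lets the driver core create and remove the sysfs
 * files at device_register()/device_unregister() time, replacing the
 * manual device_create_file() calls. */
static struct device example_device = {
	.id	= 0,
	.groups	= example_groups,
};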
Signed-off-by: Baolin Wang Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: sboyd@codeaurora.org Cc: broonie@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/d80dccb981dc2461781ebb8d71a32ccdc1b0e6f9.1516167691.git.baolin.wang@linaro.org --- kernel/time/clocksource.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7ce53465782b..0e974cface0b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -993,6 +993,14 @@ static ssize_t available_clocksource_show(struct device *dev, } static DEVICE_ATTR_RO(available_clocksource); +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); + static struct bus_type clocksource_subsys = { .name = "clocksource", .dev_name = "clocksource", @@ -1001,6 +1009,7 @@ static struct bus_type clocksource_subsys = { static struct device device_clocksource = { .id = 0, .bus = &clocksource_subsys, + .groups = clocksource_groups, }; static int __init init_clocksource_sysfs(void) @@ -1009,17 +1018,7 @@ static int __init init_clocksource_sysfs(void) if (!error) error = device_register(&device_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_current_clocksource); - if (!error) - error = device_create_file(&device_clocksource, - &dev_attr_unbind_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_available_clocksource); + return error; } -- cgit v1.2.3-71-gd317 From 04860d48a8aba5b21ae5ba1f86b0d4e3bc628fff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 26 Feb 2018 14:49:26 +0100 Subject: locking/lockdep: Show unadorned pointers Show unadorned pointers in lockdep reports - lockdep is a debugging facility and hashing pointers there doesn't make a whole lotta sense. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20180226134926.23069-1-bp@alien8.de --- kernel/locking/lockdep.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83f1969..12a2805dd64a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -557,7 +557,7 @@ static void print_lock(struct held_lock *hlock) } print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%p>] %pS\n", + printk(KERN_CONT ", at: [<%px>] %pS\n", (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); } @@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - printk("\nnew class %p: %s", class->key, class->name); + printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) } printk("%*s }\n", depth, ""); - printk("%*s ... key at: [<%p>] %pS\n", + printk("%*s ... 
key at: [<%px>] %pS\n", depth, "", class->key, class->key); } @@ -2340,7 +2340,7 @@ cache_hit: if (very_verbose(class)) { printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", + "%016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2349,7 +2349,7 @@ cache_hit: } if (very_verbose(class)) { - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + printk("hardirqs last enabled at (%u): [<%px>] %pS\n", curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, (void *)curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + printk("hardirqs last disabled at (%u): [<%px>] %pS\n", curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, (void *)curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): [<%p>] %pS\n", + printk("softirqs last enabled at (%u): [<%px>] %pS\n", curr->softirq_enable_event, (void *)curr->softirq_enable_ip, (void *)curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): [<%p>] %pS\n", + printk("softirqs last disabled at (%u): [<%px>] %pS\n", curr->softirq_disable_event, (void *)curr->softirq_disable_ip, (void *)curr->softirq_disable_ip); } @@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, * Sanity check, the lock-class key must be persistent: */ if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); + printk("BUG: key %px not in .data!\n", key); /* * What it says above ^^^^^, I suggest you read it. */ @@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); + printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); -- cgit v1.2.3-71-gd317 From d61e2944b6364006e3d7a0152aaafda741c8c876 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 26 Feb 2018 17:50:43 +0200 Subject: genirq: Add wakeup sysfs node to show IRQ wakeup state Surprisingly there is no simple way to see if the IRQ line in question is wakeup source or not. Note that wakeup might be an OOB (out-of-band) source like GPIO line which makes things slightly more complicated. Add a sysfs node to cover this case. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Tested-by: Tony Lindgren Cc: Grygorii Strashko Cc: "Rafael J . 
Wysocki" Link: https://lkml.kernel.org/r/20180226155043.67937-1-andriy.shevchenko@linux.intel.com --- Documentation/ABI/testing/sysfs-kernel-irq | 7 +++++++ kernel/irq/irqdesc.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-irq b/Documentation/ABI/testing/sysfs-kernel-irq index eb074b100986..8910d0c4bcd8 100644 --- a/Documentation/ABI/testing/sysfs-kernel-irq +++ b/Documentation/ABI/testing/sysfs-kernel-irq @@ -51,3 +51,10 @@ Date: September 2016 KernelVersion: 4.9 Contact: Craig Gallek Description: The type of the interrupt. Either the string 'level' or 'edge'. + +What: /sys/kernel/irq//wakeup +Date: March 2018 +KernelVersion: 4.17 +Contact: Andy Shevchenko +Description: The wakeup state of the interrupt. Either the string + 'enabled' or 'disabled'. diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 49b54e9979cc..d9ded088d336 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -210,6 +210,22 @@ static ssize_t type_show(struct kobject *kobj, } IRQ_ATTR_RO(type); +static ssize_t wakeup_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(wakeup); + static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -253,6 +269,7 @@ static struct attribute *irq_attrs[] = { &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, + &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL -- cgit v1.2.3-71-gd317 From fde9fc766e96c494b82931b1d270a9a751be07c0 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Mon, 19 Feb 2018 16:55:06 +0000 Subject: signals: Move put_compat_sigset to compat.h to silence hardened usercopy Since commit afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations"), MIPS systems booting with a compat root filesystem emit a warning when copying compat siginfo to userspace: WARNING: CPU: 0 PID: 953 at mm/usercopy.c:81 usercopy_warn+0x98/0xe8 Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'task_struct' (offset 1432, size 16)! Modules linked in: CPU: 0 PID: 953 Comm: S01logging Not tainted 4.16.0-rc2 #10 Stack : ffffffff808c0000 0000000000000000 0000000000000001 65ac85163f3bdc4a 65ac85163f3bdc4a 0000000000000000 90000000ff667ab8 ffffffff808c0000 00000000000003f8 ffffffff808d0000 00000000000000d1 0000000000000000 000000000000003c 0000000000000000 ffffffff808c8ca8 ffffffff808d0000 ffffffff808d0000 ffffffff80810000 fffffc0000000000 ffffffff80785c30 0000000000000009 0000000000000051 90000000ff667eb0 90000000ff667db0 000000007fe0d938 0000000000000018 ffffffff80449958 0000000020052798 ffffffff808c0000 90000000ff664000 90000000ff667ab0 00000000100c0000 ffffffff80698810 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffffffff8010d02c 65ac85163f3bdc4a ... 
Call Trace: [] show_stack+0x9c/0x130 [] dump_stack+0x90/0xd0 [] __warn+0x100/0x118 [] warn_slowpath_fmt+0x4c/0x70 [] usercopy_warn+0x98/0xe8 [] __check_object_size+0xfc/0x250 [] put_compat_sigset+0x30/0x88 [] setup_rt_frame_n32+0xc4/0x160 [] do_signal+0x19c/0x230 [] do_notify_resume+0x60/0x78 [] work_notifysig+0x10/0x18 ---[ end trace 88fffbf69147f48a ]--- Commit 5905429ad856 ("fork: Provide usercopy whitelisting for task_struct") noted that: "While the blocked and saved_sigmask fields of task_struct are copied to userspace (via sigmask_to_save() and setup_rt_frame()), it is always copied with a static length (i.e. sizeof(sigset_t))." However, this is not true in the case of compat signals, whose sigset is copied by put_compat_sigset and receives size as an argument. At most call sites, put_compat_sigset is copying a sigset from the current task_struct. This triggers a warning when CONFIG_HARDENED_USERCOPY is active. However, by marking this function as static inline, the warning can be avoided because in all of these cases the size is constant at compile time, which is allowed. The only site where this is not the case is handling the rt_sigpending syscall, but there the copy is being made from a stack local variable so does not trigger the warning. Move put_compat_sigset to compat.h, and mark it static inline. This fixes the WARN on MIPS. Fixes: afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations") Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: "Dmitry V . Levin" Cc: Al Viro Cc: kernel-hardening@lists.openwall.com Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/18639/ Signed-off-by: James Hogan --- include/linux/compat.h | 26 ++++++++++++++++++++++++-- kernel/compat.c | 19 ------------------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8a9643857c4a..c4139c7a0de0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -17,6 +17,7 @@ #include #include #include /* for aio_context_t */ +#include #include #include @@ -550,8 +551,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); -extern int put_compat_sigset(compat_sigset_t __user *compat, - const sigset_t *set, unsigned int size); + +/* + * Defined inline such that size can be compile time constant, which avoids + * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct + */ +static inline int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) +{ + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; + switch (_NSIG_WORDS) { + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; + } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? 
-EFAULT : 0; +#endif } asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, diff --git a/kernel/compat.c b/kernel/compat.c index 3247fe761f60..3f5fa8902e7d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, - unsigned int size) -{ - /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN - compat_sigset_t v; - switch (_NSIG_WORDS) { - case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; - case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; - case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; - case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; - } - return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else - return copy_to_user(compat, set, size) ? -EFAULT : 0; -#endif -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, -- cgit v1.2.3-71-gd317 From c2e513821d5df5e772287f6d0c23fd17b7c2bb1a Mon Sep 17 00:00:00 2001 From: Mario Leinweber Date: Fri, 2 Mar 2018 13:20:07 -0500 Subject: sched/deadline: Clean up various coding style details - Fixed style error: Missing space before the open parenthesis - Fixed style warnings: 2x Missing blank line after declaration One warning left: else after return (I don't feel comfortable fixing that without side effects) Signed-off-by: Mario Leinweber Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20180302182007.28691-1-marioleinweber@web.de Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d890d3..6a9defebbb54 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -42,8 +42,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) return; /* adapted from lib/prio_heap.c */ - while(1) { + while (1) { u64 largest_dl; + l = left_child(idx); r = right_child(idx); largest = idx; @@ -131,6 +132,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, return 1; } else { int best_cpu = cpudl_maximum(cp); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && @@ -205,6 +207,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; -- cgit v1.2.3-71-gd317 From 97fb7a0a8944bd6d2c5634e1e0fa689a5c40bc22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 14:01:12 +0100 Subject: sched: Clean up and harmonize the coding style of the scheduler code base A good number of small style inconsistencies have accumulated in the scheduler core, so do a pass over them to harmonize all these details: - fix spelling in comments, - use curly braces for multi-line statements, - remove unnecessary parentheses from integer literals, - capitalize consistently, - remove stray newlines, - add comments where necessary, - remove invalid/unnecessary comments, - align structure definitions and other data types vertically, - add missing
newlines for increased readability, - fix vertical tabulation where it's misaligned, - harmonize preprocessor conditional block labeling and vertical alignment, - remove line-breaks where they uglify the code, - add newline after local variable definitions, No change in functionality: md5: 1191fa0a890cfa8132156d2959d7e9e2 built-in.o.before.asm 1191fa0a890cfa8132156d2959d7e9e2 built-in.o.after.asm Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/autogroup.c | 12 +- kernel/sched/autogroup.h | 8 +- kernel/sched/clock.c | 22 +- kernel/sched/core.c | 6 +- kernel/sched/cpuacct.c | 20 +- kernel/sched/cpudeadline.c | 13 +- kernel/sched/cpudeadline.h | 27 +- kernel/sched/cpufreq_schedutil.c | 129 +++++----- kernel/sched/cpupri.c | 9 +- kernel/sched/cpupri.h | 24 +- kernel/sched/cputime.c | 48 ++-- kernel/sched/deadline.c | 51 ++-- kernel/sched/debug.c | 88 +++---- kernel/sched/fair.c | 183 +++++++------- kernel/sched/idle.c | 6 +- kernel/sched/idle_task.c | 3 +- kernel/sched/isolation.c | 2 +- kernel/sched/loadavg.c | 30 +-- kernel/sched/membarrier.c | 18 +- kernel/sched/rt.c | 25 +- kernel/sched/sched.h | 529 ++++++++++++++++++++------------------- kernel/sched/stats.c | 7 +- kernel/sched/stats.h | 86 +++---- kernel/sched/stop_task.c | 3 +- kernel/sched/swait.c | 3 + kernel/sched/topology.c | 42 ++-- kernel/sched/wait.c | 4 + kernel/sched/wait_bit.c | 18 +- 28 files changed, 706 insertions(+), 710 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index bb4b9fe026a1..ff1b7b647b86 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -168,18 +168,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) autogroup_kref_put(prev); } -/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +/* Allocates GFP_KERNEL, cannot be called under any spinlock: */ void sched_autogroup_create_attach(struct task_struct *p) { struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra reference added by autogroup_create() */ + + /* Drop extra reference added by autogroup_create(): */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); -/* Cannot be called under siglock. Currently has no users */ +/* Cannot be called under siglock. Currently has no users: */ void sched_autogroup_detach(struct task_struct *p) { autogroup_move_group(p, &autogroup_default); @@ -202,7 +203,6 @@ static int __init setup_autogroup(char *str) return 1; } - __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS @@ -224,7 +224,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) if (nice < 0 && !can_nice(current, nice)) return -EPERM; - /* this is a heavy operation taking global locks.. */ + /* This is a heavy operation, taking global locks.. */ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) return -EAGAIN; @@ -267,4 +267,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif /* CONFIG_SCHED_DEBUG */ +#endif diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b89824..49e6ec9559cf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -7,9 +7,9 @@ struct autogroup { /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. 
+ * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. */ struct kref kref; struct task_group *tg; @@ -56,11 +56,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) return tg; } -#ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { return 0; } -#endif #endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086babe6c61..7da6bec8a2ff 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,5 +1,5 @@ /* - * sched_clock for unstable cpu clocks + * sched_clock() for unstable CPU clocks * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * @@ -11,7 +11,7 @@ * Guillaume Chazarain * * - * What: + * What this file implements: * * cpu_clock(i) provides a fast (execution time) high resolution * clock with bounded drift between CPUs. The value of cpu_clock(i) @@ -26,11 +26,11 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * local_clock() -- is cpu_clock() on the current cpu. + * local_clock() -- is cpu_clock() on the current CPU. * * sched_clock_cpu(i) * - * How: + * How it is implemented: * * The implementation either uses sched_clock() when * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the @@ -302,21 +302,21 @@ again: * cmpxchg64 below only protects one readout. * * We must reread via sched_clock_local() in the retry case on - * 32bit as an NMI could use sched_clock_local() via the + * 32-bit kernels as an NMI could use sched_clock_local() via the * tracer and hit between the readout of - * the low32bit and the high 32bit portion. + * the low 32-bit and the high 32-bit portion. */ this_clock = sched_clock_local(my_scd); /* - * We must enforce atomic readout on 32bit, otherwise the - * update on the remote cpu can hit inbetween the readout of - * the low32bit and the high 32bit portion. + * We must enforce atomic readout on 32-bit, otherwise the + * update on the remote CPU can hit inbetween the readout of + * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); #else /* - * On 64bit the read of [my]scd->clock is atomic versus the - * update, so we can avoid the above 32bit dance. + * On 64-bit kernels the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32-bit dance. */ sched_clock_local(my_scd); again: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8fff4f16c510..9427b59551c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -135,7 +135,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old cpu in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * * If we observe the new CPU in task_rq_lock, the acquire will @@ -1457,7 +1457,7 @@ EXPORT_SYMBOL_GPL(kick_process); * * - cpu_active must be a subset of cpu_online * - * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, * see __set_cpus_allowed_ptr(). At this point the newly online * CPU isn't yet part of the sched domains, and balancing will not * see it. 
@@ -3037,7 +3037,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * 64-bit doesn't need locks to atomically read a 64bit value. + * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a4fab6..1abd325e733a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -18,7 +18,7 @@ * (balbir@in.ibm.com). */ -/* Time spent by the tasks of the cpu accounting group executing in ... */ +/* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... user mode */ CPUACCT_STAT_SYSTEM, /* ... kernel mode */ @@ -35,12 +35,12 @@ struct cpuacct_usage { u64 usages[CPUACCT_STAT_NSTATS]; }; -/* track cpu usage of a group of tasks and its child groups */ +/* track CPU usage of a group of tasks and its child groups */ struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - struct cpuacct_usage __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every CPU */ + struct cpuacct_usage __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; }; static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) @@ -48,7 +48,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group to which this task belongs */ +/* Return CPU accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { return css_ca(task_css(tsk, cpuacct_cgrp_id)); @@ -65,7 +65,7 @@ static struct cpuacct root_cpuacct = { .cpuusage = &root_cpuacct_cpuusage, }; -/* create a new cpu accounting group */ +/* Create a new CPU accounting group */ static struct cgroup_subsys_state * cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -96,7 +96,7 @@ out: return ERR_PTR(-ENOMEM); } -/* destroy an existing cpu accounting group */ +/* Destroy an existing CPU accounting group */ static void cpuacct_css_free(struct cgroup_subsys_state *css) { struct cpuacct *ca = css_ca(css); @@ -162,7 +162,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) #endif } -/* return total cpu usage (in nanoseconds) of a group */ +/* Return total CPU usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, enum cpuacct_stat_index index) { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 6a9defebbb54..cb172b61d191 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,7 +10,6 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ - #include #include #include @@ -147,9 +146,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } /* - * cpudl_clear - remove a cpu from the cpudl max-heap + * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu + * @cpu: the target CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -188,8 +187,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) /* * cpudl_set - update the cpudl max-heap * @cp: the cpudl max-heap context - * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu + * @cpu: the target CPU + * @dl: the new earliest deadline for this CPU * * Notes: assumes cpu_rq(cpu)->lock is locked * @@ -224,7 +223,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) /* * cpudl_set_freecpu - Set the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_set_freecpu(struct cpudl *cp, int cpu) { @@ -234,7 +233,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) /* * cpudl_clear_freecpu - Clear the cpudl.free_cpus * @cp: the cpudl max-heap context - * @cpu: rd attached cpu + * @cpu: rd attached CPU */ void cpudl_clear_freecpu(struct cpudl *cp, int cpu) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26e108e..c26e7a0e5a66 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,35 +1,28 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUDL_H -#define _LINUX_CPUDL_H - #include #include -#define IDX_INVALID -1 +#define IDX_INVALID -1 struct cpudl_item { - u64 dl; - int cpu; - int idx; + u64 dl; + int cpu; + int idx; }; struct cpudl { - raw_spinlock_t lock; - int size; - cpumask_var_t free_cpus; - struct cpudl_item *elements; + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; }; - #ifdef CONFIG_SMP -int cpudl_find(struct cpudl *cp, struct task_struct *p, - struct cpumask *later_mask); +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); -int cpudl_init(struct cpudl *cp); +int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ - -#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7936f548e071..0dad8160e00f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -20,52 +20,52 @@ #include "sched.h" struct sugov_tunables { - struct gov_attr_set attr_set; - unsigned int rate_limit_us; + struct gov_attr_set attr_set; + unsigned int rate_limit_us; }; struct sugov_policy { - struct cpufreq_policy *policy; - - struct sugov_tunables *tunables; - struct list_head tunables_hook; - - raw_spinlock_t update_lock; /* For shared policies */ - u64 last_freq_update_time; - s64 freq_update_delay_ns; - unsigned int next_freq; - unsigned int cached_raw_freq; - - /* The next fields are only needed if fast switch cannot be used. 
*/ - struct irq_work irq_work; - struct kthread_work work; - struct mutex work_lock; - struct kthread_worker worker; - struct task_struct *thread; - bool work_in_progress; - - bool need_freq_update; + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + unsigned int cached_raw_freq; + + /* The next fields are only needed if fast switch cannot be used: */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + bool work_in_progress; + + bool need_freq_update; }; struct sugov_cpu { - struct update_util_data update_util; - struct sugov_policy *sg_policy; - unsigned int cpu; + struct update_util_data update_util; + struct sugov_policy *sg_policy; + unsigned int cpu; - bool iowait_boost_pending; - unsigned int iowait_boost; - unsigned int iowait_boost_max; + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy. */ - unsigned long util_cfs; - unsigned long util_dl; - unsigned long max; - unsigned int flags; + /* The fields below are only needed when sharing a policy: */ + unsigned long util_cfs; + unsigned long util_dl; + unsigned long max; + unsigned int flags; - /* The field below is for single-CPU policies only. */ + /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; + unsigned long saved_idle_calls; #endif }; @@ -79,9 +79,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) /* * Since cpufreq_update_util() is called with rq->lock held for - * the @target_cpu, our per-cpu data is fully serialized. + * the @target_cpu, our per-CPU data is fully serialized. * - * However, drivers cannot in general deal with cross-cpu + * However, drivers cannot in general deal with cross-CPU * requests, so while get_next_freq() will work, our * sugov_update_commit() call may not for the fast switching platforms. 
* @@ -111,6 +111,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) } delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; } @@ -345,8 +346,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) return get_next_freq(sg_policy, util, max); } -static void sugov_update_shared(struct update_util_data *hook, u64 time, - unsigned int flags) +static void +sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -423,8 +424,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) return sprintf(buf, "%u\n", tunables->rate_limit_us); } -static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, - size_t count) +static ssize_t +rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -479,11 +480,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; struct sched_attr attr = { - .size = sizeof(struct sched_attr), - .sched_policy = SCHED_DEADLINE, - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, /* * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. @@ -663,21 +664,21 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; - sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; - sg_policy->work_in_progress = false; - sg_policy->need_freq_update = false; - sg_policy->cached_raw_freq = 0; + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; + sg_cpu->flags = 0; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { @@ -721,14 +722,14 @@ static void sugov_limits(struct cpufreq_policy *policy) } static struct cpufreq_governor schedutil_gov = { - .name = "schedutil", - .owner = THIS_MODULE, - .dynamic_switching = true, - .init = sugov_init, - .exit = sugov_exit, - .start = sugov_start, - .stop = sugov_stop, - .limits = sugov_limits, + .name = "schedutil", + .owner = THIS_MODULE, + .dynamic_switching = true, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..f43e14ccb67d 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -14,7 +14,7 @@ * * going from the lowest priority to the highest. 
CPUs in the INVALID state * are not eligible for routing. The system maintains this state with - * a 2 dimensional bitmap (the first for priority class, the second for cpus + * a 2 dimensional bitmap (the first for priority class, the second for CPUs * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a @@ -26,7 +26,6 @@ * as published by the Free Software Foundation; version 2 * of the License. */ - #include #include #include @@ -128,9 +127,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, } /** - * cpupri_set - update the cpu priority setting + * cpupri_set - update the CPU priority setting * @cp: The cpupri context - * @cpu: The target cpu + * @cpu: The target CPU * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked @@ -151,7 +150,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) return; /* - * If the cpu was currently mapped to a different value, we + * If the CPU was currently mapped to a different value, we * need to map it to the new value then remove the old value. * Note, we must add the new value first, otherwise we risk the * cpu being missed by the priority loop in cpupri_find. diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..141a06c914c6 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,32 +1,26 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CPUPRI_H -#define _LINUX_CPUPRI_H - #include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - atomic_t count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { - struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int *cpu_to_pri; + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #endif - -#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..d3b450b57ade 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -113,9 +113,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, } /* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update + * Account user CPU time to a process. + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in user space since the last update */ void account_user_time(struct task_struct *p, u64 cputime) { @@ -135,9 +135,9 @@ void account_user_time(struct task_struct *p, u64 cputime) } /* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update + * Account guest CPU time to a process. 
+ * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in virtual machine since the last update */ void account_guest_time(struct task_struct *p, u64 cputime) { @@ -159,9 +159,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) } /* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update + * Account system CPU time to a process and desired cpustat field + * @p: the process that the CPU time gets accounted to + * @cputime: the CPU time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ void account_system_index_time(struct task_struct *p, @@ -179,10 +179,10 @@ void account_system_index_time(struct task_struct *p, } /* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to + * Account system CPU time to a process. + * @p: the process that the CPU time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * @cputime: the CPU time spent in kernel space since the last update */ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { @@ -205,7 +205,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) /* * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait + * @cputime: the CPU time spent in involuntary wait */ void account_steal_time(u64 cputime) { @@ -216,7 +216,7 @@ void account_steal_time(u64 cputime) /* * Account for idle time. - * @cputime: the cpu time spent in idle wait + * @cputime: the CPU time spent in idle wait */ void account_idle_time(u64 cputime) { @@ -338,7 +338,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to + * @p: the process that the CPU time gets accounted to * @user_tick: is the tick from userspace * @rq: the pointer to rq * @@ -400,17 +400,16 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) {} + struct rq *rq, int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +# ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_common_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) @@ -421,8 +420,7 @@ void vtime_common_task_switch(struct task_struct *prev) vtime_flush(prev); arch_vtime_task_switch(prev); } -#endif - +# endif #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -469,10 +467,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) *ut = cputime.utime; *st = cputime.stime; } -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ + /* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to + * Account a single tick of CPU time. 
+ * @p: the process that the CPU time gets accounted to * @user_tick: indicates if the tick is a user or a system tick */ void account_process_tick(struct task_struct *p, int user_tick) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 65cd5ead1759..58f8b7b37983 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -539,12 +539,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p /* * If we cannot preempt any rq, fall back to pick any - * online cpu. + * online CPU: */ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); if (cpu >= nr_cpu_ids) { /* - * Fail to find any suitable cpu. + * Failed to find any suitable CPU. * The task will never come back! */ BUG_ON(dl_bandwidth_enabled()); @@ -608,8 +608,7 @@ static inline void queue_pull_task(struct rq *rq) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, - int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); /* * We are being explicitly informed that a new instance is starting, @@ -1873,7 +1872,7 @@ static int find_later_rq(struct task_struct *task) /* * We have to consider system topology and task affinity - * first, then we can look for a suitable cpu. + * first, then we can look for a suitable CPU. */ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) return -1; @@ -1887,7 +1886,7 @@ static int find_later_rq(struct task_struct *task) * Now we check how well this matches with task's * affinity and system topology. * - * The last cpu where the task run is our first + * The last CPU where the task ran is our first * guess, since it is most likely cache-hot there. */ if (cpumask_test_cpu(cpu, later_mask)) @@ -1917,9 +1916,9 @@ static int find_later_rq(struct task_struct *task) best_cpu = cpumask_first_and(later_mask, sched_domain_span(sd)); /* - * Last chance: if a cpu being in both later_mask + * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our - * choice. Of course, the latest possible cpu is + * choice. Of course, the latest possible CPU is * already under consideration through later_mask. */ if (best_cpu < nr_cpu_ids) { @@ -2075,7 +2074,7 @@ retry: if (task == next_task) { /* * The task is still there. We don't try - * again, some other cpu will pull it when ready. + * again, some other CPU will pull it when ready. */ goto out; } @@ -2308,7 +2307,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one - * from an overloaded cpu, if any.
*/ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; @@ -2634,17 +2633,17 @@ void __dl_clear_params(struct task_struct *p) { struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - dl_se->dl_density = 0; + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - dl_se->dl_non_contending = 0; - dl_se->dl_overrun = 0; + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2663,21 +2662,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); + unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; int cpus, ret; unsigned long flags; + dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(dest_cpu); overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) + if (overflow) { ret = -EBUSY; - else { + } else { /* * We reserve space for this task in the destination * root_domain, as we can't fail after this point. @@ -2689,6 +2689,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2709,6 +2710,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); + return ret; } @@ -2726,6 +2728,7 @@ bool dl_cpu_busy(unsigned int cpu) overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + return overflow; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..7c82a9b88510 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -9,7 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ - #include #include #include @@ -274,34 +273,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); + set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); + set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[13] is terminator */ return table; @@ -332,8 +316,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) return table; } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static cpumask_var_t sd_sysctl_cpus; +static struct ctl_table_header *sd_sysctl_header; void register_sched_domain_sysctl(void) { @@ -413,14 +397,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group { struct sched_entity 
*se = tg->se[cpu]; -#define P(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) -#define PN(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -428,6 +408,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); + if (schedstat_enabled()) { PN_SCHEDSTAT(se->statistics.wait_start); PN_SCHEDSTAT(se->statistics.sleep_start); @@ -440,6 +421,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN_SCHEDSTAT(se->statistics.wait_sum); P_SCHEDSTAT(se->statistics.wait_count); } + P(se->load.weight); P(se->runnable_weight); #ifdef CONFIG_SMP @@ -464,6 +446,7 @@ static char *task_group_path(struct task_group *tg) return group_path; cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif @@ -799,9 +782,9 @@ void sysrq_sched_debug_show(void) /* * This itererator needs some explanation. * It returns 1 for the header position. - * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * This means 2 is CPU 0. + * In a hotplugged system some CPUs, including CPU 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. 
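The offset encoding the iterator comment above describes (1 is the header, n+2 is CPU n, with hotplug holes skipped by cpumask iteration) can be made concrete with a small user-space sketch; the present-CPU mask and helper below are illustrative, not the kernel code:

#include <stdio.h>

/* Pretend mask of possible CPUs; CPU 1 is "missing", as after a hotplug. */
static const int cpu_present[4] = { 1, 0, 1, 1 };

/* Map a seq_file offset to the token the start routine would return:
 * offset 0 -> header (encoded as 1), otherwise the (offset-1)-th
 * present CPU encoded as cpu + 2, or 0 past the end of the mask. */
static long debug_pos(long offset)
{
	if (offset == 0)
		return 1;		/* header position */

	long n = offset - 1;		/* n-th present CPU */
	for (int cpu = 0; cpu < 4; cpu++) {
		if (!cpu_present[cpu])
			continue;	/* cpumask_* iteration skips holes */
		if (n-- == 0)
			return cpu + 2;	/* "2 is CPU 0" encoding */
	}
	return 0;			/* end of sequence */
}

int main(void)
{
	for (long off = 0; off < 6; off++)
		printf("offset %ld -> token %ld\n", off, debug_pos(off));
	return 0;
}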
*/ static void *sched_debug_start(struct seq_file *file, loff_t *offset) { @@ -821,6 +804,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } @@ -835,10 +819,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) } static const struct seq_operations sched_debug_sops = { - .start = sched_debug_start, - .next = sched_debug_next, - .stop = sched_debug_stop, - .show = sched_debug_show, + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, }; static int sched_debug_release(struct inode *inode, struct file *file) @@ -876,14 +860,10 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) #ifdef CONFIG_NUMA_BALANCING diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1febd252a84..1f877de96c9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,6 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ - #include #include @@ -103,7 +102,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; #ifdef CONFIG_SMP /* - * For asym packing, by default the lower numbered cpu has higher priority. + * For asym packing, by default the lower numbered CPU has higher priority. */ int __weak arch_asym_cpu_priority(int cpu) { @@ -1181,7 +1180,7 @@ pid_t task_numa_group_id(struct task_struct *p) } /* - * The averaged statistics, shared & private, memory & cpu, + * The averaged statistics, shared & private, memory & CPU, * occupy the first half of the array. The second half of the * array is for current counters, which are averaged into the * first set by task_numa_placement. @@ -1587,7 +1586,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. */ if (cur) { - /* Skip this swap candidate if cannot move to the source cpu */ + /* Skip this swap candidate if cannot move to the source CPU: */ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; @@ -1631,7 +1630,7 @@ static void task_numa_compare(struct task_numa_env *env, goto balance; } - /* Balance doesn't matter much if we're running a task per cpu */ + /* Balance doesn't matter much if we're running a task per CPU: */ if (imp > env->best_imp && src_rq->nr_running == 1 && dst_rq->nr_running == 1) goto assign; @@ -1676,7 +1675,7 @@ balance: */ if (!cur) { /* - * select_idle_siblings() uses an per-cpu cpumask that + * select_idle_siblings() uses an per-CPU cpumask that * can be used from IRQ context. */ local_irq_disable(); @@ -3362,7 +3361,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } /* - * Called within set_task_rq() right before setting a task's cpu. The + * Called within set_task_rq() right before setting a task's CPU. 
The * caller only guarantees p->pi_lock is held; no other assumptions, * including the state of rq->lock, should be made. */ @@ -3541,7 +3540,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with cpu capacity wehreas the runnable sum + * As the running sum is scaled with CPU capacity whereas the runnable sum * is not we rescale running_sum 1st */ running_sum = se->avg.util_sum / @@ -4688,7 +4687,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) add_nr_running(rq, task_delta); - /* determine whether we need to wake up potentially idle cpu */ + /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); } @@ -5053,7 +5052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) } /* - * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * Both these CPU hotplug callbacks race against unregister_fair_sched_group() * * The race is harmless, since modifying bandwidth settings of unhooked group * bits doesn't do much. @@ -5098,7 +5097,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ cfs_rq->runtime_remaining = 1; /* - * Offline rq is schedulable till cpu is completely disabled + * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; @@ -5335,8 +5334,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); * * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when cpu may be busy, then we have: + * If a CPU misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when CPU may be busy, then we have: * * load_n = (1 - 1/2^i)^n * load_0 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load @@ -5480,7 +5479,7 @@ static unsigned long weighted_cpuload(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. * * Therefore we need to avoid the delta approach from the regular tick when @@ -5591,7 +5590,7 @@ void cpu_load_update_active(struct rq *this_rq) } /* - * Return a low guess at the load of a migration-source cpu weighted + * Return a low guess at the load of a migration-source CPU weighted * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to @@ -5609,7 +5608,7 @@ static unsigned long source_load(int cpu, int type) } /* - * Return a high guess at the load of a migration-target cpu weighted + * Return a high guess at the load of a migration-target CPU weighted * according to the scheduling class and "nice" value.
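The under- and over-estimation these two kernel-doc blocks describe comes down to clamping the decayed load history against the instantaneous load: min() for migration sources, max() for migration targets. A simplified user-space rendering with assumed fields (the real helpers read rq->cpu_load[] and weighted_cpuload(), and also consult the LB_BIAS feature flag):

#include <stdint.h>

/* Toy per-CPU state: decayed load history plus the instantaneous load. */
struct toy_rq {
	unsigned long cpu_load[5];	/* decayed history, index = decay rate */
	unsigned long inst_load;	/* weighted_cpuload() stand-in */
};

/* Low guess for a migration source: never above the instantaneous load,
 * so lightly loaded sources are not drained by a stale high sample. */
static unsigned long toy_source_load(const struct toy_rq *rq, int type)
{
	unsigned long total = rq->inst_load;

	if (type == 0)
		return total;
	return rq->cpu_load[type - 1] < total ? rq->cpu_load[type - 1] : total;
}

/* High guess for a migration target: never below the instantaneous load,
 * so a briefly quiet target is not over-committed. */
static unsigned long toy_target_load(const struct toy_rq *rq, int type)
{
	unsigned long total = rq->inst_load;

	if (type == 0)
		return total;
	return rq->cpu_load[type - 1] > total ? rq->cpu_load[type - 1] : total;
}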
*/ static unsigned long target_load(int cpu, int type) @@ -5889,7 +5888,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward cpus of our domain */ + /* Bias balancing toward CPUs of our domain */ if (local_group) load = source_load(i, load_idx); else @@ -5919,7 +5918,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (min_runnable_load > (runnable_load + imbalance)) { /* * The runnable load is significantly smaller - * so we can pick this new cpu + * so we can pick this new CPU: */ min_runnable_load = runnable_load; min_avg_load = avg_load; @@ -5928,7 +5927,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, (100*min_avg_load > imbalance_scale*avg_load)) { /* * The runnable loads are close so take the - * blocked load into account through avg_load. + * blocked load into account through avg_load: */ min_avg_load = avg_load; idlest = group; @@ -5989,7 +5988,7 @@ skip_spare: } /* - * find_idlest_group_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. */ static int find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -6067,12 +6066,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p new_cpu = find_idlest_group_cpu(group, p, cpu); if (new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ + /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; continue; } - /* Now try balancing at a lower domain level of new_cpu */ + /* Now try balancing at a lower domain level of 'new_cpu': */ cpu = new_cpu; weight = sd->span_weight; sd = NULL; @@ -6082,7 +6081,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (tmp->flags & sd_flag) sd = tmp; } - /* while loop will break here if sd == NULL */ } return new_cpu; @@ -6278,12 +6276,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; /* - * If the previous cpu is cache affine and idle, don't be stupid. + * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; - /* Check a recently used CPU as a potential idle candidate */ + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && @@ -6292,7 +6290,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { /* * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake. + * candidate for the next wake: */ p->recent_used_cpu = prev; return recent_used_cpu; @@ -6357,7 +6355,7 @@ static inline unsigned long task_util(struct task_struct *p) } /* - * cpu_util_wake: Compute cpu utilization with any contributions from + * cpu_util_wake: Compute CPU utilization with any contributions from * the waking task p removed. */ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) @@ -6403,10 +6401,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 
* - * Balances load by selecting the idlest cpu in the idlest group, or under - * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. + * Balances load by selecting the idlest CPU in the idlest group, or under + * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * - * Returns the target cpu number. + * Returns the target CPU number. * * preempt must be disabled. */ @@ -6431,7 +6429,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; /* - * If both cpu and prev_cpu are part of this domain, + * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && @@ -6482,9 +6480,9 @@ pick_cpu: static void detach_entity_cfs_rq(struct sched_entity *se); /* - * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. + * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -6918,17 +6916,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * BASICS * * The purpose of load-balancing is to achieve the same basic fairness the - * per-cpu scheduler provides, namely provide a proportional amount of compute + * per-CPU scheduler provides, namely provide a proportional amount of compute * time to each task. This is expressed in the following equation: * * W_i,n/P_i == W_j,n/P_j for all i,j (1) * - * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight * W_i,0 is defined as: * * W_i,0 = \Sum_j w_i,j (2) * - * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous @@ -6936,7 +6934,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * C_i is the compute capacity of cpu i, typically it is the + * C_i is the compute capacity of CPU i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * @@ -6957,11 +6955,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * SCHED DOMAINS * * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) - * for all i,j solution, we create a tree of cpus that follows the hardware + * for all i,j solution, we create a tree of CPUs that follows the hardware * topology where each level pairs two lower groups (or better). This results - * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of cpus in + * of load-balance at each level inv. proportional to the number of CPUs in * the groups. 
* * This yields: @@ -6970,7 +6968,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * \Sum { --- * --- * 2^i } = O(n) (5) * i = 0 2^i 2^i * `- size of each group - * | | `- number of cpus doing load-balance + * | | `- number of CPUs doing load-balance * | `- freq * `- sum over all levels * @@ -6978,7 +6976,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * this makes (5) the runtime complexity of the balancer. * * An important property here is that each CPU is still (indirectly) connected - * to every other cpu in at most O(log n) steps: + * to every other CPU in at most O(log n) steps: * * The adjacency matrix of the resulting graph is given by: * @@ -6990,7 +6988,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * A^(log_2 n)_i,j != 0 for all i,j (7) * - * Showing there's indeed a path between every cpu in at most O(log n) steps. + * Showing there's indeed a path between every CPU in at most O(log n) steps. * The task movement gives a factor of O(m), giving a convergence complexity * of: * @@ -7000,7 +6998,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * WORK CONSERVING * * In order to avoid CPUs going idle while there's still work to do, new idle - * balancing is more aggressive and has the newly idle cpu iterate up the domain + * balancing is more aggressive and has the newly idle CPU iterate up the domain * tree itself instead of relying on other CPUs to bring it work. * * This adds some complexity to both (5) and (8) but it reduces the total idle @@ -7021,7 +7019,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) * - * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. * * The big problem is S_k, its a global sum needed to compute a local (W_i) * property. @@ -7185,7 +7183,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags |= LBF_SOME_PINNED; /* - * Remember if this task can be migrated to any other cpu in + * Remember if this task can be migrated to any other CPU in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * @@ -7195,7 +7193,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; @@ -7769,8 +7767,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. * - * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a - * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a + * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 
* Something like: * * { 0 1 2 3 } { 4 5 6 7 } @@ -7778,7 +7776,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload - * cpu 3 and leave one of the cpus in the second group unused. + * cpu 3 and leave one of the CPUs in the second group unused. * * The current solution to this issue is detecting the skew in the first group * by noticing the lower domain failed to reach balance and had difficulty @@ -7891,7 +7889,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - /* Bias balancing toward cpus of our domain */ + /* Bias balancing toward CPUs of our domain: */ if (local_group) load = target_load(i, load_idx); else @@ -7977,7 +7975,7 @@ asym_packing: if (!(env->sd->flags & SD_ASYM_PACKING)) return true; - /* No ASYM_PACKING if target cpu is already busy */ + /* No ASYM_PACKING if target CPU is already busy */ if (env->idle == CPU_NOT_IDLE) return true; /* @@ -7990,7 +7988,7 @@ asym_packing: if (!sds->busiest) return true; - /* Prefer to move from lowest priority cpu's work */ + /* Prefer to move from lowest priority CPU's work */ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu)) return true; @@ -8243,7 +8241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages - * to ensure cpu-load equilibrium, look at wider averages. XXX + * to ensure CPU-load equilibrium, look at wider averages. XXX */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); @@ -8262,7 +8260,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * If there aren't any idle cpus, avoid creating some. + * If there aren't any idle CPUs, avoid creating some. */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { @@ -8276,9 +8274,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * We're trying to get all the cpus to the average_load, so we don't + * We're trying to get all the CPUs to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load. At the same time, + * reduce the max loaded CPU below the average load. At the same time, * we also don't want to reduce the group load below the group * capacity. Thus we look for the minimum possible imbalance. */ @@ -8372,9 +8370,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group is not overloaded + * This CPU is idle. If the busiest group is not overloaded * and there is no imbalance between this and busiest group - * wrt idle cpus, it is balanced. The imbalance becomes + * wrt idle CPUs, it is balanced. The imbalance becomes * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */ @@ -8402,7 +8400,7 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the cpus in group. + * find_busiest_queue - find the busiest runqueue among the CPUs in the group. 
*/ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) @@ -8446,7 +8444,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu capacity. + * which is not scaled with the CPU capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && @@ -8454,9 +8452,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; /* - * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu capacity, so - * that the load can be moved away from the cpu that is + * For the load comparisons with the other CPUs, consider + * the weighted_cpuload() scaled with the CPU capacity, so + * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * * Thus we're looking for max(wl_i / capacity_i), crosswise @@ -8527,13 +8525,13 @@ static int should_we_balance(struct lb_env *env) return 0; /* - * In the newly idle case, we will allow all the cpu's + * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. */ if (env->idle == CPU_NEWLY_IDLE) return 1; - /* Try to find first idle cpu */ + /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; @@ -8546,7 +8544,7 @@ static int should_we_balance(struct lb_env *env) balance_cpu = group_balance_cpu(sg); /* - * First idle cpu or the first cpu(busiest) in this sched group + * First idle CPU or the first CPU(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ return balance_cpu == env->dst_cpu; @@ -8655,7 +8653,7 @@ more_balance: * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we - * iterate on same src_cpu is dependent on number of cpus in our + * iterate on same src_cpu is dependent on number of CPUs in our * sched_group. * * This changes load balance semantics a bit on who can move @@ -8672,7 +8670,7 @@ more_balance: */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - /* Prevent to re-select dst_cpu via env's cpus */ + /* Prevent to re-select dst_cpu via env's CPUs */ cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); @@ -8734,9 +8732,10 @@ more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the active_load_balance_cpu_stop, - * if the curr task on busiest cpu can't be - * moved to this_cpu + /* + * Don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest CPU can't be + * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, @@ -8962,7 +8961,7 @@ out: } /* - * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances.
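The election rule in should_we_balance() above, with the fallback to the group's designated balance CPU when none are idle, can be sketched compactly; the array-based masks below are illustrative stand-ins for kernel cpumasks:

#include <stdbool.h>

#define NR 8

/* Return the CPU that is allowed to balance this group: the first idle
 * CPU in the balance mask, or the designated group balance CPU if none
 * are idle. Mirrors the election logic described above. */
static int elect_balance_cpu(const bool idle[NR], const bool in_mask[NR],
			     int group_balance_cpu)
{
	for (int cpu = 0; cpu < NR; cpu++) {
		if (in_mask[cpu] && idle[cpu])
			return cpu;		/* first idle CPU wins */
	}
	return group_balance_cpu;		/* first CPU (busiest) fallback */
}

/* A CPU should balance iff it is the elected balance CPU (env->dst_cpu),
 * so at most one CPU per group does the work at each domain level. */
static bool should_balance(int this_cpu, const bool idle[NR],
			   const bool in_mask[NR], int group_balance_cpu)
{
	return elect_balance_cpu(idle, in_mask, group_balance_cpu) == this_cpu;
}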
@@ -8986,7 +8985,7 @@ static int active_load_balance_cpu_stop(void *data) if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) goto out_unlock; - /* make sure the requested cpu hasn't gone down in the meantime */ + /* Make sure the requested CPU hasn't gone down in the meantime: */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) goto out_unlock; @@ -8998,7 +8997,7 @@ static int active_load_balance_cpu_stop(void *data) /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. + * Bjorn Helgaas on a 128-CPU setup. */ BUG_ON(busiest_rq == target_rq); @@ -9100,7 +9099,7 @@ static void nohz_balancer_kick(void) return; /* * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which + * This way we generate a sched IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ @@ -9157,14 +9156,12 @@ unlock: } /* - * This routine will record that the cpu is going idle with tick stopped. + * This routine will record that the CPU is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ void nohz_balance_enter_idle(int cpu) { - /* - * If this cpu is going down, then nothing needs to be done. - */ + /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) return; @@ -9175,9 +9172,7 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; - /* - * If we're a completely isolated CPU, we don't play. - */ + /* If we're a completely isolated CPU, we don't play: */ if (on_null_domain(cpu_rq(cpu))) return; @@ -9286,7 +9281,7 @@ out: /* * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be + * When the CPU is attached to null domain for ex, it will not be * updated. */ if (likely(update_next_balance)) { @@ -9310,7 +9305,7 @@ out: #ifdef CONFIG_NO_HZ_COMMON /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -9330,8 +9325,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) continue; /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load + * If this CPU gets work to do, stop the load balancing + * work being done for other CPUs. Next load * balancing owner will pick it up. */ if (need_resched()) @@ -9373,13 +9368,13 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. + * of an idle CPU in the system. * - This rq has more than one task. * - This rq has at least one CFS task and the capacity of the CPU is * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + * - At parent of LLC scheduler domain level, this CPU's scheduler group has + * multiple busy CPUs. + * - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler * domain span are idle.
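Read as code, the kick heuristic listed above is a plain disjunction of cheap per-rq tests; a user-space sketch with assumed snapshot fields (illustrative only, the real predicate walks the sched domains under RCU):

#include <stdbool.h>

/* Assumed snapshot of the local rq state; field names are illustrative. */
struct rq_snapshot {
	int  nr_running;	/* total runnable tasks on this rq */
	int  nr_cfs;		/* CFS tasks on this rq */
	bool capacity_reduced;	/* RT/IRQ pressure shrank CPU capacity */
	bool llc_group_busy;	/* >1 busy CPU in the LLC-parent group */
	bool asym_lower_idle;	/* SD_ASYM_PACKING: lower-numbered CPU idle */
};

/* Any one condition holding is enough to kick the idle load balancer. */
static bool nohz_kick_needed_sketch(const struct rq_snapshot *rq)
{
	if (rq->nr_running > 1)
		return true;
	if (rq->nr_cfs && rq->capacity_reduced)
		return true;
	return rq->llc_group_busy || rq->asym_lower_idle;
}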
*/ static inline bool nohz_kick_needed(struct rq *rq) @@ -9469,10 +9464,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) CPU_IDLE : CPU_NOT_IDLE; /* - * If this cpu has a pending nohz_balance_kick, then do the - * balancing on behalf of the other idle cpus whose ticks are + * If this CPU has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to - * give the idle cpus a chance to load balance. Else we may + * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..343d25f85477 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,5 +1,5 @@ /* - * Generic entry point for the idle threads + * Generic entry points for the idle threads */ #include #include @@ -332,8 +332,8 @@ void cpu_startup_entry(enum cpuhp_state state) { /* * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). Will be fixed in 3.11 + * make this generic (ARM and SH have never invoked the canary + * init for the non boot CPUs!). Will be fixed in 3.11 */ #ifdef CONFIG_X86 /* diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 48b8a83f5185..ec73680922f8 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -14,7 +14,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } -#endif /* CONFIG_SMP */ +#endif /* * Idle tasks are unconditionally rescheduled: @@ -30,6 +30,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf put_prev_task(rq, prev); update_idle_core(rq); schedstat_inc(rq->sched_goidle); + return rq->idle; } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 39f340dde1d7..aad5f48a07c6 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -6,13 +6,13 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ - #include #include #include #include #include #include + #include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a398e7e28a8a 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -32,29 +32,29 @@ * Due to a number of reasons the above turns in the mess below: * * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach + * serious number of CPUs, therefore we need to take a distributed approach * to calculating nr_active. * * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } * * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate + * can simply take per-CPU deltas and fold those into a global accumulate * to obtain the same result. See calc_load_fold_active(). * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * Furthermore, in order to avoid synchronizing all per-CPU delta folding * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. 
+ * CPU to have completed this task. * * This places an upper-bound on the IRQ-off latency of the machine. Then * again, being late doesn't loose the delta, just wrecks the sample. * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because + * this would add another cross-CPU cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever CPU the task ran + * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. + * all CPUs yields the correct result. * * This covers the NO_HZ=n code, for extra head-aches, see the comment below. */ @@ -115,11 +115,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * Handle NO_HZ for the global load-average. * * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by + * load-average relies on per-CPU sampling from the tick, it is affected by * NO_HZ. * * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * entering NO_HZ state such that we can include this as an 'extra' CPU delta * when we read the global state. * * Obviously reality has to ruin such a delightfully simple scheme: @@ -146,9 +146,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which + * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ * intervals. * * When making the ILB scale, we should try to pull this in as well. @@ -299,7 +299,7 @@ calc_load_n(unsigned long load, unsigned long exp, } /* - * NO_HZ can leave us missing all per-cpu ticks calling + * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. @@ -363,7 +363,7 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ delta = calc_load_nohz_fold(); if (delta) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d0762633639..2c6ae2413fa2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -27,18 +27,18 @@ * except MEMBARRIER_CMD_QUERY. 
*/ #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE -#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ - (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) #else #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif -#define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ - | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) @@ -85,6 +85,7 @@ static int membarrier_global_expedited(void) */ if (cpu == raw_smp_processor_id()) continue; + rcu_read_lock(); p = task_rcu_dereference(&cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & @@ -188,6 +189,7 @@ static int membarrier_private_expedited(int flags) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; } @@ -219,6 +221,7 @@ static int membarrier_register_global_expedited(void) } atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); + return 0; } @@ -253,6 +256,7 @@ static int membarrier_register_private_expedited(int flags) synchronize_sched(); } atomic_or(state, &mm->membarrier_state); + return 0; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c80563b4f6b9..e40498872111 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1453,9 +1453,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) return; /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: + * There appear to be other CPUs that can accept + * the current task but none can run 'p', so let's reschedule + * to try and push the current task away: */ requeue_task_rt(rq, p, 1); resched_curr(rq); @@ -1596,12 +1596,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) if (!task_running(rq, p) && cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; + return 0; } /* * Return the highest pushable rq's task, which is suitable to be executed - * on the cpu, NULL otherwise + * on the CPU, NULL otherwise */ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { @@ -1639,11 +1640,11 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* - * At this point we have built a mask of cpus representing the + * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. * - * We prioritize the last cpu that the task executed on since + * We prioritize the last CPU that the task executed on since * it is most likely cache-hot in that location. */ if (cpumask_test_cpu(cpu, lowest_mask)) @@ -1651,7 +1652,7 @@ static int find_lowest_rq(struct task_struct *task) /* * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data.
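The selection order spelled out in these comments, with the last CPU the task ran on preferred and a topology-guided fallback after it, can be condensed into a sketch; this collapses the sched_domain walk into a simple scan, and every helper here is a hypothetical stand-in:

#include <stdbool.h>

#define NR_CPUS 8

/* Pick a CPU from lowest_mask with the documented preference order:
 * 1) the CPU the task last ran on (most likely cache-hot),
 * 2) this CPU (cheap, no cross-CPU traffic),
 * 3) any remaining CPU in the mask (cpumask_any() fallback). */
static int pick_lowest_cpu(const bool lowest_mask[NR_CPUS],
			   int task_cpu, int this_cpu)
{
	if (lowest_mask[task_cpu])
		return task_cpu;

	if (this_cpu >= 0 && lowest_mask[this_cpu])
		return this_cpu;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (lowest_mask[cpu])
			return cpu;

	return -1;			/* no targets found */
}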
*/ if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ @@ -1692,6 +1693,7 @@ static int find_lowest_rq(struct task_struct *task) cpu = cpumask_any(lowest_mask); if (cpu < nr_cpu_ids) return cpu; + return -1; } @@ -1827,7 +1829,7 @@ retry: * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue * to push it to. Do not retry in this case, since - * other cpus will pull from us when ready. + * other CPUs will pull from us when ready. */ goto out; } @@ -1919,7 +1921,7 @@ static int rto_next_cpu(struct root_domain *rd) * rt_next_cpu() will simply return the first CPU found in * the rto_mask. * - * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * If rto_next_cpu() is called with rto_cpu is a valid CPU, it * will return the next CPU found in the rto_mask. * * If there are no more CPUs left in the rto_mask, then a check is made @@ -1980,7 +1982,7 @@ static void tell_cpu_to_push(struct rq *rq) raw_spin_lock(&rq->rd->rto_lock); /* - * The rto_cpu is updated under the lock, if it has a valid cpu + * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. * Otherwise it is finishing up and an ipi needs to be sent. @@ -2105,7 +2107,7 @@ static void pull_rt_task(struct rq *this_rq) /* * There's a chance that p is higher in priority - * than what's currently running on its cpu. + * than what's currently running on its CPU. * This is just that p is wakeing up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the @@ -2693,6 +2695,7 @@ int sched_rr_handler(struct ctl_table *table, int write, msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); + return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc6c8b5a24ad..bd1461ae06e4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ - +/* + * Scheduler internal types and methods: + */ #include #include #include @@ -79,11 +81,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are - * pretty high and the returns do not justify the increased costs. + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. * - * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to - * increase coverage and consistency always enable it on 64bit platforms. + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. */ #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) @@ -111,16 +113,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } * 10 -> just above 1us * 9 -> just above 0.5us */ -#define DL_SCALE (10) - -/* - * These are the 'tuning knobs' of the scheduler: - */ +#define DL_SCALE 10 /* - * single value that denotes runtime == period, ie unlimited time. + * Single value that denotes runtime == period, ie unlimited time. 
*/ -#define RUNTIME_INF ((u64)~0ULL) +#define RUNTIME_INF ((u64)~0ULL) static inline int idle_policy(int policy) { @@ -235,9 +233,9 @@ void __dl_clear_params(struct task_struct *p); * control. */ struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; }; static inline int dl_bandwidth_enabled(void) @@ -246,8 +244,9 @@ static inline int dl_bandwidth_enabled(void) } struct dl_bw { - raw_spinlock_t lock; - u64 bw, total_bw; + raw_spinlock_t lock; + u64 bw; + u64 total_bw; }; static inline void __dl_update(struct dl_bw *dl_b, s64 bw); @@ -273,20 +272,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); -extern int sched_dl_global_validate(void); +extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); -extern int sched_dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed); -extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, - const struct cpumask *trial); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -300,32 +296,36 @@ extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchical_quota; - u64 runtime_expires; - - int idle, period_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + s64 hierarchical_quota; + u64 runtime_expires; + + int idle; + int period_active; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + u64 throttled_time; #endif }; -/* task group related information */ +/* Task group related information */ struct task_group { struct cgroup_subsys_state css; #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; #ifdef CONFIG_SMP /* @@ -333,29 +333,29 @@ struct task_group { * it in its own cacheline separated from the fields above which * will also be accessed at each tick. 
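The ____cacheline_aligned annotation above is what keeps the cross-CPU-written
load_avg out of the cacheline holding the fields read on every tick. A generic
sketch of the false-sharing pattern being avoided (the struct is illustrative,
not kernel code):

    struct example_group {
            /* read on every tick, rarely written: */
            unsigned long   shares;

            /* written from every CPU; gets its own cacheline: */
            atomic_long_t   load_avg ____cacheline_aligned;
    };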
*/ - atomic_long_t load_avg ____cacheline_aligned; + atomic_long_t load_avg ____cacheline_aligned; #endif #endif #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; - struct rt_bandwidth rt_bandwidth; + struct rt_bandwidth rt_bandwidth; #endif - struct rcu_head rcu; - struct list_head list; + struct rcu_head rcu; + struct list_head list; - struct task_group *parent; - struct list_head siblings; - struct list_head children; + struct task_group *parent; + struct list_head siblings; + struct list_head children; #ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; + struct autogroup *autogroup; #endif - struct cfs_bandwidth cfs_bandwidth; + struct cfs_bandwidth cfs_bandwidth; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -369,8 +369,8 @@ struct task_group { * (The default weight is 1024 - so there's no practical * limitation from this.) */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) #endif typedef int (*tg_visitor)(struct task_group *, void *); @@ -443,35 +443,39 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { - struct load_weight load; - unsigned long runnable_weight; - unsigned int nr_running, h_nr_running; + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; - u64 exec_clock; - u64 min_vruntime; + u64 exec_clock; + u64 min_vruntime; #ifndef CONFIG_64BIT - u64 min_vruntime_copy; + u64 min_vruntime_copy; #endif - struct rb_root_cached tasks_timeline; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last, *skip; + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; #ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; + unsigned int nr_spread_over; #endif #ifdef CONFIG_SMP /* * CFS load tracking */ - struct sched_avg avg; + struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -482,9 +486,9 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - long propagate; - long prop_runnable_sum; + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; /* * h_load = weight * f(tg) @@ -492,36 +496,38 @@ struct cfs_rq { * Where f(tg) is the recursive weight fraction assigned to * this group. */ - unsigned long h_load; - u64 last_h_load_update; - struct sched_entity *h_load_next; + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. 
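A few lines up, MIN_SHARES and MAX_SHARES bound the group weights configured
from userspace. A sketch of how such a clamp is typically applied when a
weight is installed; the function name is illustrative, not the fair-class
entry point itself:

    /* Illustrative clamp of a user-supplied group weight. */
    static unsigned long clamp_group_shares(unsigned long shares)
    {
            if (shares < MIN_SHARES)
                    shares = MIN_SHARES;
            if (shares > MAX_SHARES)
                    shares = MAX_SHARES;

            return shares;
    }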
*/ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_clock, throttled_clock_task; - u64 throttled_clock_task_time; - int throttled, throttle_count; - struct list_head throttled_list; + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_task; + u64 throttled_clock_task_time; + int throttled; + int throttle_count; + struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -538,45 +544,45 @@ static inline int rt_bandwidth_enabled(void) /* Real-Time classes' related field in a runqueue: */ struct rt_rq { - struct rt_prio_array active; - unsigned int rt_nr_running; - unsigned int rr_nr_running; + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { - int curr; /* highest queued rt task prio */ + int curr; /* highest queued rt task prio */ #ifdef CONFIG_SMP - int next; /* next highest */ + int next; /* next highest */ #endif } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ - int rt_queued; + int rt_queued; - int rt_throttled; - u64 rt_time; - u64 rt_runtime; + int rt_throttled; + u64 rt_time; + u64 rt_runtime; /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned long rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; + struct task_group *tg; #endif }; /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root_cached root; + struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned long dl_nr_running; #ifdef CONFIG_SMP /* @@ -586,28 +592,28 @@ struct dl_rq { * should migrate somewhere else. */ struct { - u64 curr; - u64 next; + u64 curr; + u64 next; } earliest_dl; - unsigned long dl_nr_migratory; - int overloaded; + unsigned long dl_nr_migratory; + int overloaded; /* * Tasks on this rq that can be pushed away. They are kept in * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root_cached pushable_dl_tasks_root; + struct rb_root_cached pushable_dl_tasks_root; #else - struct dl_bw dl_bw; + struct dl_bw dl_bw; #endif /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a * task blocks */ - u64 running_bw; + u64 running_bw; /* * Utilization of the tasks "assigned" to this runqueue (including @@ -618,14 +624,14 @@ struct dl_rq { * This is needed to compute the "inactive utilization" for the * runqueue (inactive utilization = this_bw - running_bw). */ - u64 this_bw; - u64 extra_bw; + u64 this_bw; + u64 extra_bw; /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. 
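The comment above defines the inactive utilization only implicitly; spelled
out, the relation between the three bandwidth fields is a one-liner. A sketch
(the helper name is made up, and the real GRUB reclaiming code in deadline.c
goes further, scaling consumed runtime by bw_ratio):

    /*
     * Sketch: bandwidth admitted to this rq but belonging to
     * currently blocked tasks.
     */
    static u64 dl_inactive_bw(struct dl_rq *dl_rq)
    {
            return dl_rq->this_bw - dl_rq->running_bw;
    }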
*/ - u64 bw_ratio; + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -638,51 +644,51 @@ static inline bool sched_asym_prefer(int a, int b) /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new + * fully partitioning the member CPUs from any other cpuset. Whenever a new * exclusive cpuset is created, we also create and attach a new root-domain * object. * */ struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + bool overload; /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). */ - cpumask_var_t dlo_mask; - atomic_t dlo_count; - struct dl_bw dl_bw; - struct cpudl cpudl; + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. */ - struct irq_work rto_push_work; - raw_spinlock_t rto_lock; + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; /* These are only updated and read within rto_lock */ - int rto_loop; - int rto_cpu; + int rto_loop; + int rto_cpu; /* These atomics are updated outside of a lock */ - atomic_t rto_loop_next; - atomic_t rto_loop_start; + atomic_t rto_loop_next; + atomic_t rto_loop_start; #endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_var_t rto_mask; - struct cpupri cpupri; + cpumask_var_t rto_mask; + struct cpupri cpupri; - unsigned long max_cpu_capacity; + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -708,39 +714,39 @@ extern void rto_push_irq_work_func(struct irq_work *work); */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. 
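rq->nr_running does double duty: besides the load calculation mentioned here,
it drives the root_domain "overload" flag declared earlier in this hunk. A
sketch of that update as it is typically done from the load-balance path; the
helper name is illustrative:

    /* Sketch: mark the root domain once any rq holds >1 runnable task. */
    static void update_rd_overload(struct rq *rq)
    {
            if (rq->nr_running > 1 && !rq->rd->overload)
                    rq->rd->overload = true;
    }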
*/ - unsigned int nr_running; + unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; + unsigned int nr_numa_running; + unsigned int nr_preferred_running; #endif #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP - unsigned long last_load_update_tick; + unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - unsigned long nohz_flags; + unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; + /* capture load from *all* tasks on this CPU: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; - struct list_head *tmp_alone_branch; + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -749,94 +755,98 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned long nr_uninterruptible; - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - u64 clock_task; + unsigned int clock_update_flags; + u64 clock; + u64 clock_task; - atomic_t nr_iowait; + atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; - unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; + struct callback_head *balance_callback; - struct callback_head *balance_callback; + unsigned char idle_balance; - unsigned char idle_balance; /* For active balancing */ - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; /* This is used to determine avg_idle's max value */ - u64 max_idle_balance_cost; + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; + u64 prev_irq_time; #endif #ifdef CONFIG_PARAVIRT - u64 prev_steal_time; + u64 prev_steal_time; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; + u64 prev_steal_time_rq; #endif /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; + unsigned long calc_load_update; + long calc_load_active; #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP - int hrtick_csd_pending; - call_single_data_t hrtick_csd; + int hrtick_csd_pending; + call_single_data_t hrtick_csd; #endif - struct hrtimer hrtick_timer; + struct hrtimer hrtick_timer; #endif #ifdef 
CONFIG_SCHEDSTATS /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_count; + unsigned int yld_count; /* schedule() stats */ - unsigned int sched_count; - unsigned int sched_goidle; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; #endif #ifdef CONFIG_SMP - struct llist_head wake_list; + struct llist_head wake_list; #endif #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ - struct cpuidle_state *idle_state; + struct cpuidle_state *idle_state; #endif }; @@ -902,9 +912,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) * one position though, because the next rq_unpin_lock() will shift it * back. */ -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 -#define RQCF_UPDATED 0x04 +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 static inline void assert_clock_updated(struct rq *rq) { @@ -1057,12 +1067,12 @@ extern void sched_ttwu_pending(void); /** * highest_flag_domain - Return highest sched_domain containing flag. - * @cpu: The cpu whose highest level of sched domain is to + * @cpu: The CPU whose highest level of sched domain is to * be returned. * @flag: The flag to check for the highest sched_domain - * for the given cpu. + * for the given CPU. * - * Returns the highest sched_domain of a cpu which contains the given flag. + * Returns the highest sched_domain of a CPU which contains the given flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { @@ -1097,30 +1107,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { - atomic_t ref; + atomic_t ref; /* * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned long capacity; - unsigned long min_capacity; /* Min per-CPU capacity in group */ - unsigned long next_update; - int imbalance; /* XXX unrelated to capacity but shared group state */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ #ifdef CONFIG_SCHED_DEBUG - int id; + int id; #endif - unsigned long cpumask[0]; /* balance mask */ + unsigned long cpumask[0]; /* Balance mask */ }; struct sched_group { - struct sched_group *next; /* Must be a circular list */ - atomic_t ref; + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; - unsigned int group_weight; + unsigned int group_weight; struct sched_group_capacity *sgc; - int asym_prefer_cpu; /* cpu of highest priority in group */ + int asym_prefer_cpu; /* CPU of highest priority in group */ /* * The CPUs this group covers. @@ -1129,7 +1139,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1146,8 +1156,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) } /** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 
- * @group: The group whose first cpu is to be returned. + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. */ static inline unsigned int group_first_cpu(struct sched_group *group) { @@ -1357,9 +1367,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) /* * wake flags */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1370,11 +1380,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 -extern const int sched_prio_to_weight[40]; -extern const u32 sched_prio_to_wmult[40]; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: @@ -1396,9 +1406,9 @@ extern const u32 sched_prio_to_wmult[40]; */ #define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -1420,10 +1430,10 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*yield_task) (struct rq *rq); - bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* * It is the responsibility of the pick_next_task() method that will @@ -1433,16 +1443,16 @@ struct sched_class { * May return RETRY_TASK when it finds a higher prio class has runnable * tasks. 
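The RETRY_TASK contract described above is exercised by the core scheduler
walking the classes in priority order. A condensed sketch of that dispatch
loop, omitting the fair-class fast path the real core.c version has:

    static struct task_struct *
    pick_next_task_sketch(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
    {
            const struct sched_class *class;
            struct task_struct *p;

    again:
            for_each_class(class) {
                    p = class->pick_next_task(rq, prev, rf);
                    if (p) {
                            if (unlikely(p == RETRY_TASK))
                                    goto again;     /* a higher class woke up */
                            return p;
                    }
            }

            /* The idle class should always have a runnable task: */
            BUG();
    }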
*/ - struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev, - struct rq_flags *rf); - void (*put_prev_task) (struct rq *rq, struct task_struct *p); + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_woken) (struct rq *this_rq, struct task_struct *task); + void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); @@ -1451,31 +1461,31 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_fork) (struct task_struct *p); - void (*task_dead) (struct task_struct *p); + void (*set_curr_task)(struct rq *rq); + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); /* * The switched_from() call is allowed to drop rq->lock, therefore we * cannot assume the switched_from/switched_to pair is serliazed by * rq->lock. They are however serialized by p->pi_lock. */ - void (*switched_from) (struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + int oldprio); - unsigned int (*get_rr_interval) (struct rq *rq, - struct task_struct *task); + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); - void (*update_curr) (struct rq *rq); + void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group) (struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p, int type); #endif }; @@ -1524,6 +1534,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { SCHED_WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; } #else @@ -1562,9 +1573,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -#define BW_SHIFT 20 -#define BW_UNIT (1 << BW_SHIFT) -#define RATIO_SHIFT 8 +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1814,8 +1825,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, + * already in proper order on entry. 
This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, * regardless of entry order into the function. */ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) @@ -1847,7 +1858,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ + /* printk() doesn't work well under rq->lock */ raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -2106,15 +2117,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity -#ifndef arch_scale_freq_invariant -#define arch_scale_freq_invariant() (true) -#endif -#else /* arch_scale_freq_capacity */ -#define arch_scale_freq_invariant() (false) +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - static inline unsigned long cpu_util_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2124,5 +2134,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) { return rq->cfs.avg.util_avg; } - #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..968c1fe3099a 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -78,8 +78,8 @@ static int show_schedstat(struct seq_file *seq, void *v) * This itererator needs some explanation. * It returns 1 for the header position. * This means 2 is cpu 0. - * In a hotplugged system some cpus, including cpu 0, may be missing so we have - * to use cpumask_* to iterate over the cpus. + * In a hotplugged system some CPUs, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the CPUs. 
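The off-by-two encoding described above (position 1 is the header, position
n+2 is CPU n) is easy to misread, so here is the decode direction as a sketch;
the helper is illustrative, not part of stats.c:

    /* Sketch: map a schedstat seq_file cookie back to a CPU number. */
    static int it_to_cpu(void *v)
    {
            unsigned long n = (unsigned long)v;

            if (n == 1)
                    return -1;              /* header row, not a CPU */

            return (int)(n - 2);            /* position 2 is CPU 0, and so on */
    }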
*/ static void *schedstat_start(struct seq_file *file, loff_t *offset) { @@ -99,12 +99,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) if (n < nr_cpu_ids) return (void *)(unsigned long)(n + 2); + return NULL; } static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; + return schedstat_start(file, offset); } @@ -134,6 +136,7 @@ static const struct file_operations proc_schedstat_operations = { static int __init proc_schedstat_init(void) { proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58de61e7..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) #define __schedstat_inc(var) do { var++; } while (0) -#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) #define __schedstat_add(var, amt) do { var += (amt); } while (0) -#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) -#define __schedstat_set(var, val) do { var = (val); } while (0) -#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -#define schedstat_val(var) (var) -#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) - -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long long delta) -{} -#define schedstat_enabled() 0 -#define __schedstat_inc(var) do { } while (0) -#define schedstat_inc(var) do { } while (0) -#define __schedstat_add(var, amt) do { } while (0) -#define schedstat_add(var, amt) do { } while (0) -#define __schedstat_set(var, val) do { } while (0) -#define schedstat_set(var, val) do { } while (0) -#define schedstat_val(var) 0 -#define schedstat_val_or_zero(var) 0 +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO @@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) /* * We are interested in knowing how long it was from the *first* time a - * task was queued to the time that it finally hit a cpu, we call this routine - * from dequeue_task() to account for possible rq->clock skew across cpus. The - * delta taken on each cpu would annul the skew. + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. */ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { @@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) } /* - * Called when a task finally hits the cpu. We can now calculate how + * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ @@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) + if (unlikely(sched_info_on())) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); + } } /* @@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(rq) - - t->sched_info.last_arrival; + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; rq_sched_info_depart(rq, delta); @@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* - * prev now departs the cpu. It's not interesting to record + * prev now departs the CPU. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. 
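All of these hooks reduce to two timestamps kept per task. Condensing the
arrive-side accounting shown in this hunk into one sketch (field names as in
struct sched_info; the function name is illustrative):

    /* Sketch: wait time runs from 'first queued' to 'got the CPU'. */
    static void account_wait_time(struct rq *rq, struct task_struct *t)
    {
            unsigned long long now = rq_clock(rq), delta = 0;

            if (t->sched_info.last_queued)
                    delta = now - t->sched_info.last_queued;

            t->sched_info.last_queued = 0;
            t->sched_info.run_delay += delta;
            t->sched_info.last_arrival = now;
            t->sched_info.pcount++;
    }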
*/ @@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, if (next != rq->idle) sched_info_arrive(rq, next); } + static inline void -sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) __sched_info_switch(rq, prev, next); } -#else -#define sched_info_queued(rq, t) do { } while (0) -#define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(rq, t) do { } while (0) -#define sched_info_depart(rq, t) do { } while (0) -#define sched_info_arrive(rq, next) do { } while (0) -#define sched_info_switch(rq, t, next) do { } while (0) + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_queued(rq, t) do { } while (0) +# define sched_info_reset_dequeued(t) do { } while (0) +# define sched_info_dequeued(rq, t) do { } while (0) +# define sched_info_depart(rq, t) do { } while (0) +# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ea8d2b6a1239..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * stop-task scheduling class. * @@ -9,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b88ab4e0207f 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * (simple wait queues ) implementation: + */ #include #include diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024f4e94..219eee70e457 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -41,8 +41,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); return -1; } @@ -50,12 +49,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_span(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); @@ -115,8 +112,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -595,7 +591,7 @@ int group_balance_cpu(struct sched_group *sg) * are not. * * This leads to a few particularly weird cases where the sched_domain's are - * not of the same number for each cpu. Consider: + * not of the same number for each CPU. 
Consider: * * NUMA-2 0-3 0-3 * groups: {0-2},{1-3} {1-3},{0-2} @@ -780,7 +776,7 @@ fail: * ^ ^ ^ ^ * `-' `-' * - * The sched_domains are per-cpu and have a two way link (parent & child) and + * The sched_domains are per-CPU and have a two way link (parent & child) and * denote the ever growing mask of CPUs belonging to that level of topology. * * Each sched_domain has a circular (double) linked list of sched_group's, each @@ -1021,6 +1017,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; + return sa_rootdomain; } @@ -1047,12 +1044,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } #ifdef CONFIG_NUMA -static int sched_domains_numa_levels; enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; + +static int sched_domains_numa_levels; +static int sched_domains_curr_level; + +int sched_max_numa_distance; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; #endif /* @@ -1074,11 +1073,11 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * @@ -1628,7 +1627,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); #endif - /* Fixup, ensure @sd has at least @child cpus. */ + /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child)); @@ -1720,6 +1719,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; } @@ -1824,6 +1824,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, return 1; tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, new ? 
(new + idx_new) : &tmp, sizeof(struct sched_domain_attr)); @@ -1929,4 +1930,3 @@ match2: mutex_unlock(&sched_domains_mutex); } - diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..7b2a142ae629 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -107,6 +107,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, break; } } + return nr_exclusive; } @@ -317,6 +318,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock(&wq->lock); schedule(); spin_lock(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr); @@ -333,6 +335,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) spin_unlock_irq(&wq->lock); schedule(); spin_lock_irq(&wq->lock); + return 0; } EXPORT_SYMBOL(do_wait_intr_irq); @@ -378,6 +381,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i if (ret) list_del_init(&wq_entry->entry); + return ret; } EXPORT_SYMBOL(autoremove_wake_function); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..5293c59163a6 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -29,8 +29,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync wait_bit->key.bit_nr != key->bit_nr || test_bit(key->bit_nr, key->flags)) return 0; - else - return autoremove_wake_function(wq_entry, mode, sync, key); + + return autoremove_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(wake_bit_function); @@ -50,7 +50,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -73,6 +75,7 @@ int __sched out_of_line_wait_on_bit_timeout( DEFINE_WAIT_BIT(wq_entry, word, bit); wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); @@ -120,6 +123,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) __wake_up(wq_head, TASK_NORMAL, 1, &key); } @@ -157,6 +161,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) { if (BITS_PER_LONG == 64) { unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); } return bit_waitqueue(p, 0); @@ -173,6 +178,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo wait_bit->key.bit_nr != key->bit_nr || atomic_read(val) != 0) return 0; + return autoremove_wake_function(wq_entry, mode, sync, key); } @@ -196,6 +202,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en ret = (*action)(val, mode); } while (!ret && atomic_read(val) != 0); finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; } @@ -226,6 +233,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(atomic_t_wait); @@ -250,6 +258,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode) schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait); @@ -259,6 +268,7 @@ __sched int bit_wait_io(struct 
wait_bit_key *word, int mode) io_schedule(); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL(bit_wait_io); @@ -266,11 +276,13 @@ EXPORT_SYMBOL(bit_wait_io); __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); @@ -278,11 +290,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); if (signal_pending_state(mode, current)) return -EINTR; + return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); -- cgit v1.2.3-71-gd317 From 325ea10c0809406ce23f038602abbc454f3f761d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 12:20:47 +0100 Subject: sched/headers: Simplify and clean up header usage in the scheduler Do the following cleanups and simplifications: - sched/sched.h already includes , so no need to include it in sched/core.c again. - order the headers alphabetically - add all headers to kernel/sched/sched.h - remove all unnecessary includes from the .c files that are already included in kernel/sched/sched.h. Finally, make all scheduler .c files use a single common header: #include "sched.h" ... which now contains a union of the relied upon headers. This makes the various .c files easier to read and easier to handle. Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 6 --- kernel/sched/autogroup.c | 9 ++--- kernel/sched/autogroup.h | 4 -- kernel/sched/clock.c | 14 +------ kernel/sched/completion.c | 5 +-- kernel/sched/core.c | 40 +++++++------------- kernel/sched/cpuacct.c | 13 +------ kernel/sched/cpudeadline.c | 5 +-- kernel/sched/cpudeadline.h | 2 - kernel/sched/cpufreq.c | 1 - kernel/sched/cpufreq_schedutil.c | 8 +--- kernel/sched/cpupri.c | 6 +-- kernel/sched/cpupri.h | 1 - kernel/sched/cputime.c | 10 ++--- kernel/sched/deadline.c | 3 -- kernel/sched/debug.c | 11 +----- kernel/sched/fair.c | 16 +------- kernel/sched/idle.c | 15 +------- kernel/sched/idle_task.c | 5 +-- kernel/sched/isolation.c | 7 ---- kernel/sched/loadavg.c | 4 -- kernel/sched/membarrier.c | 9 +---- kernel/sched/rt.c | 4 -- kernel/sched/sched.h | 81 ++++++++++++++++++++++++++-------------- kernel/sched/stats.c | 13 +++---- kernel/sched/swait.c | 3 +- kernel/sched/topology.c | 4 -- kernel/sched/wait.c | 9 +---- kernel/sched/wait_bit.c | 5 +-- 29 files changed, 94 insertions(+), 219 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index a5bc8728ead7..0cb034331cbb 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,8 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SCHED_DEADLINE_H -#define _LINUX_SCHED_DEADLINE_H - -#include /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } - -#endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index ff1b7b647b86..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,10 +1,7 @@ // 
SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - +/* + * Auto-group scheduling implementation: + */ #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 49e6ec9559cf..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,10 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_SCHED_AUTOGROUP -#include -#include -#include - struct autogroup { /* * Reference doesn't mean how many threads attach to this diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 7da6bec8a2ff..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -52,19 +52,7 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" /* * Scheduler clock - returns current time in nanosec units. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..5d2d56b0817a 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,10 +11,7 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ - -#include -#include -#include +#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9427b59551c1..e1e334ba8ff9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,11 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include #include -#ifdef CONFIG_PARAVIRT -#include -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1abd325e733a..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,22 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sched.h" - /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cb172b61d191..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -10,10 +10,7 @@ * as published by the Free Software Foundation; version 2 * of the License. 
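The conversion pattern is identical in every file this commit touches: a pile
of narrow includes collapses into the one scheduler-internal header.
Schematically (illustrative, not any specific file):

    /* Before: each .c file picked its own dependency set. */
    #include <linux/sched.h>
    #include <linux/slab.h>
    #include <linux/cpumask.h>

    /* After: one internal header carries the union of them. */
    #include "sched.h"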
*/ -#include -#include -#include -#include "cpudeadline.h" +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index c26e7a0e5a66..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,6 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include -#include #define IDX_INVALID -1 diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - #include "sched.h" DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0dad8160e00f..feb5f89020f2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -11,14 +11,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include - #include "sched.h" +#include + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f43e14ccb67d..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -26,11 +26,7 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include -#include -#include -#include "cpupri.h" +#include "sched.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 141a06c914c6..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d3b450b57ade..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,10 +1,6 @@ -#include -#include -#include -#include -#include -#include -#include +/* + * Simple CPU accounting cgroup controller + */ #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58f8b7b37983..af491f537636 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,9 +17,6 @@ */ #include "sched.h" -#include -#include - struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7c82a9b88510..644d9a464380 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,7 +1,7 @@ /* * kernel/sched/debug.c * - * Print the CFS rbtree + * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar * @@ -9,15 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include - #include "sched.h" static DEFINE_SPINLOCK(sched_debug_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f877de96c9b..f5591071ae98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,24 +20,10 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" #include -#include "sched.h" - /* * Targeted preemption latency for CPU-bound tasks: * diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 343d25f85477..2760e0357271 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,23 +1,10 @@ /* * Generic entry points for the idle threads */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#include "sched.h" #include -#include "sched.h" - /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index ec73680922f8..488222ac4651 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - /* * idle-task scheduling class. * - * (NOTE: these are not related to SCHED_IDLE tasks which are + * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are * handled in sched/fair.c) */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index aad5f48a07c6..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -6,13 +6,6 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include -#include -#include -#include -#include -#include - #include "sched.h" DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a398e7e28a8a..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,10 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ - -#include -#include - #include "sched.h" /* diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2c6ae2413fa2..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -13,14 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - -#include -#include -#include -#include -#include - -#include "sched.h" /* for cpu_rq(). 
*/ +#include "sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40498872111..a3d438fec46c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,12 +3,8 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ - #include "sched.h" -#include -#include - int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd1461ae06e4..23ba4dd76ac4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,39 +3,71 @@ * Scheduler internal types and methods: */ #include + #include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include #include -#include -#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include + +#include -#include -#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include + +#include #ifdef CONFIG_PARAVIRT -#include +# include #endif #include "cpupri.h" @@ -1357,13 +1389,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) return p->on_rq == TASK_ON_RQ_MIGRATING; } -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_post_lock_switch -# define finish_arch_post_lock_switch() do { } while (0) -#endif - /* * wake flags */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 968c1fe3099a..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - +/* + * /proc/schedstat implementation + */ #include "sched.h" /* - * bump this up when changing the output format or the meaning of an existing + * Current schedstat API version. 
+ * + * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ #define SCHEDSTAT_VERSION 15 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b88ab4e0207f..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,8 +2,7 @@ /* * (simple wait queues ) implementation: */ -#include -#include +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 219eee70e457..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,10 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include -#include -#include - #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7b2a142ae629..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -3,14 +3,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include -#include -#include -#include -#include -#include -#include -#include +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 5293c59163a6..4239c78f5cd3 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,10 +1,7 @@ /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include -#include -#include -#include +#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -- cgit v1.2.3-71-gd317 From a92057e14beb233e8c891f4de075f2a468c71f15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 15:44:39 +0100 Subject: sched/idle: Merge kernel/sched/idle.c and kernel/sched/idle_task.c Merge these two small .c modules as they implement two aspects of idle task handling. Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 5 +- kernel/sched/idle.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/idle_task.c | 117 -------------------------------------------- 3 files changed, 125 insertions(+), 120 deletions(-) delete mode 100644 kernel/sched/idle_task.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4feff40..d9a02b318108 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o idle.o +obj-y += idle.o fair.o rt.o deadline.o +obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2760e0357271..2975f195e1c4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,5 +1,9 @@ /* - * Generic entry points for the idle threads + * Generic entry points for the idle threads and + * implementation of the idle task scheduling class. 
+ * + * (NOTE: these are not related to SCHED_IDLE batch scheduled + * tasks which are handled in sched/fair.c ) */ #include "sched.h" @@ -33,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) static int __init cpu_idle_poll_setup(char *__unused) { cpu_idle_force_poll = 1; + return 1; } __setup("nohlt", cpu_idle_poll_setup); @@ -40,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); static int __init cpu_idle_nopoll_setup(char *__unused) { cpu_idle_force_poll = 0; + return 1; } __setup("hlt", cpu_idle_nopoll_setup); @@ -51,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); stop_critical_timings(); + while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); + return 1; } @@ -337,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) while (1) do_idle(); } + +/* + * idle-task scheduling class. + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif + +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ + resched_curr(rq); +} + +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + put_prev_task(rq, prev); + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); + + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ + raw_spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. 
+ */ +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ + BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); +} + +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +static void update_curr_idle(struct rq *rq) +{ +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, +#endif + + .set_curr_task = set_curr_task_idle, + .task_tick = task_tick_idle, + + .get_rr_interval = get_rr_interval_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, +}; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index 488222ac4651..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * idle-task scheduling class. - * - * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are - * handled in sched/fair.c) - */ -#include "sched.h" - -#ifdef CONFIG_SMP -static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) -{ - return task_cpu(p); /* IDLE tasks as never migrated */ -} -#endif - -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) -{ - resched_curr(rq); -} - -static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); - - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) -{ - raw_spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - raw_spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - -/* - * scheduler tick hitting a task of our scheduling class. - * - * NOTE: This function can be called remotely by the tick offload that - * goes along full dynticks. Therefore no local assumption can be made - * and everything must be accessed through the @rq and @curr passed in - * parameters. 
- */ -static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_idle(struct rq *rq) -{ -} - -static void switched_to_idle(struct rq *rq, struct task_struct *p) -{ - BUG(); -} - -static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) -{ - BUG(); -} - -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - -static void update_curr_idle(struct rq *rq) -{ -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, - .set_cpus_allowed = set_cpus_allowed_common, -#endif - - .set_curr_task = set_curr_task_idle, - .task_tick = task_tick_idle, - - .get_rr_interval = get_rr_interval_idle, - - .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, -}; -- cgit v1.2.3-71-gd317 From 02d8ec9456f47b8865f1ff3fbb532e12a760d3b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 16:27:54 +0100 Subject: sched/deadline, rt: Rename queue_push_tasks/queue_pull_task to create separate namespace There are similarly named functions in both of these modules: kernel/sched/deadline.c:static inline void queue_push_tasks(struct rq *rq) kernel/sched/deadline.c:static inline void queue_pull_task(struct rq *rq) kernel/sched/deadline.c:static inline void queue_push_tasks(struct rq *rq) kernel/sched/deadline.c:static inline void queue_pull_task(struct rq *rq) kernel/sched/deadline.c: queue_push_tasks(rq); kernel/sched/deadline.c: queue_pull_task(rq); kernel/sched/deadline.c: queue_push_tasks(rq); kernel/sched/deadline.c: queue_pull_task(rq); kernel/sched/rt.c:static inline void queue_push_tasks(struct rq *rq) kernel/sched/rt.c:static inline void queue_pull_task(struct rq *rq) kernel/sched/rt.c:static inline void queue_push_tasks(struct rq *rq) kernel/sched/rt.c: queue_push_tasks(rq); kernel/sched/rt.c: queue_pull_task(rq); kernel/sched/rt.c: queue_push_tasks(rq); kernel/sched/rt.c: queue_pull_task(rq); ... which makes it harder to grep for them. Prefix them with deadline_ and rt_, respectively. 
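For readers without the scheduler sources at hand, the helpers being renamed all follow the generic balance-callback pattern; a minimal sketch (the class "foo", its pushable-test and its push handler are hypothetical stand-ins for the deadline/rt versions in the diff below):

static DEFINE_PER_CPU(struct callback_head, foo_push_head);    /* hypothetical */

static void push_foo_tasks(struct rq *rq);                     /* hypothetical */

static inline void foo_queue_push_tasks(struct rq *rq)
{
        if (!has_pushable_foo_tasks(rq))                       /* hypothetical */
                return;

        /* push_foo_tasks() is invoked once the rq lock is released */
        queue_balance_callback(rq, &per_cpu(foo_push_head, rq->cpu),
                               push_foo_tasks);
}

With the class prefix, grepping for deadline_queue_push_tasks() or rt_queue_push_tasks() lands only in the module that owns it.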
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 16 ++++++++-------- kernel/sched/rt.c | 14 +++++++------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index af491f537636..8b7c2b35bec9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -511,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { if (!has_pushable_dl_tasks(rq)) return; @@ -519,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); } @@ -594,11 +594,11 @@ static inline void pull_dl_task(struct rq *rq) { } -static inline void queue_push_tasks(struct rq *rq) +static inline void deadline_queue_push_tasks(struct rq *rq) { } -static inline void queue_pull_task(struct rq *rq) +static inline void deadline_queue_pull_task(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -1759,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); return p; } @@ -2309,7 +2309,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) return; - queue_pull_task(rq); + deadline_queue_pull_task(rq); } /* @@ -2331,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) - queue_push_tasks(rq); + deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -2356,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, * or lowering its prio, so... 
*/ if (!rq->dl.overloaded) - queue_pull_task(rq); + deadline_queue_pull_task(rq); /* * If we now have a earlier deadline task than p, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a3d438fec46c..4f4fd3b157f1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -355,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); -static inline void queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { if (!has_pushable_tasks(rq)) return; @@ -363,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); } -static inline void queue_pull_task(struct rq *rq) +static inline void rt_queue_pull_task(struct rq *rq) { queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } @@ -421,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) { } -static inline void queue_push_tasks(struct rq *rq) +static inline void rt_queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ @@ -1565,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); - queue_push_tasks(rq); + rt_queue_push_tasks(rq); return p; } @@ -2185,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - queue_pull_task(rq); + rt_queue_pull_task(rq); } void __init init_sched_rt_class(void) @@ -2216,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) - queue_push_tasks(rq); + rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); @@ -2240,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - queue_pull_task(rq); + rt_queue_pull_task(rq); /* * If there's a higher priority task waiting to run -- cgit v1.2.3-71-gd317 From 14a7405b2e814221a951bd7a76ce4a8d24c1b3be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Mar 2018 16:32:24 +0100 Subject: sched/core: Undefine tracepoint creation at the end of core.c Make it easier to concatenate all the scheduler .c files for single-module compilation. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e1e334ba8ff9..4f5eeb63ab5b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7071,3 +7071,5 @@ const u32 sched_prio_to_wmult[40] = { /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; + +#undef CREATE_TRACE_POINTS -- cgit v1.2.3-71-gd317 From 11dd2666375e191757dd4271d5020820c6d0e4a5 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Mon, 5 Mar 2018 15:05:20 -0700 Subject: audit: do not panic on invalid boot parameter If you pass in an invalid audit boot parameter value, e.g. "audit=off", the kernel panics very early in boot before the regular console is initialized. 
Unless you have earlyprintk enabled, there is no indication of what the problem is on the console. Convert the panic() calls to pr_err(), and leave auditing enabled if an invalid parameter value was passed in. Modify the parameter to also accept "on" or "off" as valid values, and update the documentation accordingly. Signed-off-by: Greg Edwards Signed-off-by: Paul Moore --- Documentation/admin-guide/kernel-parameters.txt | 14 +++++++------- kernel/audit.c | 21 ++++++++++++++------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 46b26bfee27b..93366b00bb62 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -387,15 +387,15 @@ Use software keyboard repeat audit= [KNL] Enable the audit sub-system - Format: { "0" | "1" } (0 = disabled, 1 = enabled) - 0 - kernel audit is disabled and can not be enabled - until the next reboot + Format: { "0" | "1" | "off" | "on" } + 0 | off - kernel audit is disabled and can not be + enabled until the next reboot unset - kernel audit is initialized but disabled and will be fully enabled by the userspace auditd. - 1 - kernel audit is initialized and partially enabled, - storing at most audit_backlog_limit messages in - RAM until it is fully enabled by the userspace - auditd. + 1 | on - kernel audit is initialized and partially + enabled, storing at most audit_backlog_limit + messages in RAM until it is fully enabled by the + userspace auditd. Default: unset audit_backlog_limit= [KNL] Set the audit queue size limit. diff --git a/kernel/audit.c b/kernel/audit.c index 1a3e75d9a66c..69ef8de69f03 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1615,19 +1615,26 @@ static int __init audit_init(void) } postcore_initcall(audit_init); -/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +/* + * Process kernel command-line parameter at boot time. + * audit={0|off} or audit={1|on}. + */ static int __init audit_enable(char *str) { - long val; - - if (kstrtol(str, 0, &val)) - panic("audit: invalid 'audit' parameter value (%s)\n", str); - audit_default = (val ? AUDIT_ON : AUDIT_OFF); + if (!strcasecmp(str, "off") || !strcmp(str, "0")) + audit_default = AUDIT_OFF; + else if (!strcasecmp(str, "on") || !strcmp(str, "1")) + audit_default = AUDIT_ON; + else { + pr_err("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = AUDIT_ON; + } if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; if (audit_set_enabled(audit_default)) - panic("audit: error setting audit state (%d)\n", audit_default); + pr_err("audit: error setting audit state (%d)\n", + audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); -- cgit v1.2.3-71-gd317 From 167f5594b5efa20a26ff03b3424f793887e6b448 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 6 Mar 2018 15:56:47 +1100 Subject: kernel/memremap: Remove stale devres_free() call devm_memremap_pages() was re-worked in e8d513483300 "memremap: change devm_memremap_pages interface to use struct dev_pagemap" to take a caller allocated struct dev_pagemap as a function parameter. A call to devres_free() was left in the error cleanup path which results in a kernel panic if the remap fails for some reason. Remove it to fix the panic and let devm_memremap_pages() fail gracefully. 
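The rule this fix restores is that devres_free() may only be given memory obtained from devres_alloc(). A minimal sketch of the correct pairing (the resource struct, release callback and setup step are illustrative, not from this patch):

static void my_release(struct device *dev, void *res)
{
        /* undo whatever the resource represents */
}

static int my_setup(struct device *dev)
{
        struct my_res *p = devres_alloc(my_release, sizeof(*p), GFP_KERNEL);

        if (!p)
                return -ENOMEM;

        if (my_init(p)) {               /* hypothetical setup step */
                devres_free(p);         /* valid: p came from devres_alloc() */
                return -EIO;
        }

        devres_add(dev, p);             /* from here on, devres owns p */
        return 0;
}

Since e8d513483300, the pgmap is allocated by the caller rather than by devres_alloc(), so handing it to devres_free() in the error path operated on memory that devres never managed.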
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...") Signed-off-by: Oliver O'Halloran Reviewed-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4dd4274cabe2..895e6b76b25e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -427,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: err_radix: pgmap_radix_release(res, pgoff); - devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); -- cgit v1.2.3-71-gd317 From 6b4f3d01052a479c7ebbe99d52a663558dc1be2a Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 8 Sep 2017 12:40:01 -0400 Subject: usb, signal, security: only pass the cred, not the secid, to kill_pid_info_as_cred and security_task_kill commit d178bc3a708f39cbfefc3fab37032d3f2511b4ec ("user namespace: usb: make usb urbs user namespace aware (v2)") changed kill_pid_info_as_uid to kill_pid_info_as_cred, saving and passing a cred structure instead of uids. Since the secid can be obtained from the cred, drop the secid fields from the usb_dev_state and async structures, and drop the secid argument to kill_pid_info_as_cred. Replace the secid argument to security_task_kill with the cred. Update SELinux, Smack, and AppArmor to use the cred, which avoids the need for Smack and AppArmor to use a secid at all in this hook. Further changes to Smack might still be required to take full advantage of this change, since it should now be possible to perform capability checking based on the supplied cred. The changes to Smack and AppArmor have only been compile-tested. Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Casey Schaufler Acked-by: Greg Kroah-Hartman Acked-by: John Johansen Signed-off-by: James Morris --- drivers/usb/core/devio.c | 10 ++-------- include/linux/lsm_hooks.h | 5 +++-- include/linux/sched/signal.h | 2 +- include/linux/security.h | 4 ++-- kernel/signal.c | 6 +++--- security/apparmor/lsm.c | 17 ++++++++++++----- security/security.c | 4 ++-- security/selinux/hooks.c | 7 +++++-- security/smack/smack_lsm.c | 12 +++++------- 9 files changed, 35 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index d526595bc959..76e16c5251b9 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -65,7 +65,6 @@ struct usb_dev_state { const struct cred *cred; void __user *disccontext; unsigned long ifclaimed; - u32 secid; u32 disabled_bulk_eps; bool privileges_dropped; unsigned long interface_allowed_mask; @@ -95,7 +94,6 @@ struct async { struct usb_memory *usbm; unsigned int mem_usage; int status; - u32 secid; u8 bulk_addr; u8 bulk_status; }; @@ -586,7 +584,6 @@ static void async_completed(struct urb *urb) struct usb_dev_state *ps = as->ps; struct siginfo sinfo; struct pid *pid = NULL; - u32 secid = 0; const struct cred *cred = NULL; int signr; @@ -602,7 +599,6 @@ static void async_completed(struct urb *urb) sinfo.si_addr = as->userurb; pid = get_pid(as->pid); cred = get_cred(as->cred); - secid = as->secid; } snoop(&urb->dev->dev, "urb complete\n"); snoop_urb(urb->dev, as->userurb, urb->pipe, urb->actual_length, @@ -618,7 +614,7 @@ static void async_completed(struct urb *urb) spin_unlock(&ps->lock); if (signr) { - kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid); + kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred); 
put_pid(pid); put_cred(cred); } @@ -1013,7 +1009,6 @@ static int usbdev_open(struct inode *inode, struct file *file) init_waitqueue_head(&ps->wait); ps->disc_pid = get_pid(task_pid(current)); ps->cred = get_current_cred(); - security_task_getsecid(current, &ps->secid); smp_wmb(); list_add_tail(&ps->list, &dev->filelist); file->private_data = ps; @@ -1727,7 +1722,6 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb as->ifnum = ifnum; as->pid = get_pid(task_pid(current)); as->cred = get_current_cred(); - security_task_getsecid(current, &as->secid); snoop_urb(ps->dev, as->userurb, as->urb->pipe, as->urb->transfer_buffer_length, 0, SUBMIT, NULL, 0); @@ -2617,7 +2611,7 @@ static void usbdev_remove(struct usb_device *udev) sinfo.si_code = SI_ASYNCIO; sinfo.si_addr = ps->disccontext; kill_pid_info_as_cred(ps->discsignr, &sinfo, - ps->disc_pid, ps->cred, ps->secid); + ps->disc_pid, ps->cred); } } } diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7161d8e7ee79..e0ac011d07a5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -672,7 +672,8 @@ * @p contains the task_struct for process. * @info contains the signal information. * @sig contains the signal value. - * @secid contains the sid of the process where the signal originated + * @cred contains the cred of the process where the signal originated, or + * NULL if the current task is the originator. * Return 0 if permission is granted. * @task_prctl: * Check permission before performing a process control operation on the @@ -1564,7 +1565,7 @@ union security_list_options { int (*task_getscheduler)(struct task_struct *p); int (*task_movememory)(struct task_struct *p); int (*task_kill)(struct task_struct *p, struct siginfo *info, - int sig, u32 secid); + int sig, const struct cred *cred); int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void (*task_to_inode)(struct task_struct *p, struct inode *inode); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23b4f9cb82db..a7ce74c74e49 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -319,7 +319,7 @@ extern int force_sig_info(int, struct siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, - const struct cred *, u32); + const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); diff --git a/include/linux/security.h b/include/linux/security.h index 73f1ef625d40..3f5fd988ee87 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -347,7 +347,7 @@ int security_task_setscheduler(struct task_struct *p); int security_task_getscheduler(struct task_struct *p); int security_task_movememory(struct task_struct *p); int security_task_kill(struct task_struct *p, struct siginfo *info, - int sig, u32 secid); + int sig, const struct cred *cred); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); @@ -1010,7 +1010,7 @@ static inline int security_task_movememory(struct task_struct *p) static inline int security_task_kill(struct 
task_struct *p, struct siginfo *info, int sig, - u32 secid) + const struct cred *cred) { return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..b033292f4beb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -770,7 +770,7 @@ static int check_kill_permission(int sig, struct siginfo *info, } } - return security_task_kill(t, info, sig, 0); + return security_task_kill(t, info, sig, NULL); } /** @@ -1361,7 +1361,7 @@ static int kill_as_cred_perm(const struct cred *cred, /* like kill_pid_info(), but doesn't use uid/euid of "current" */ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, - const struct cred *cred, u32 secid) + const struct cred *cred) { int ret = -EINVAL; struct task_struct *p; @@ -1380,7 +1380,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, secid); + ret = security_task_kill(p, info, sig, cred); if (ret) goto out_unlock; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 9a65eeaf7dfa..77bdfa7f8428 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -717,16 +717,23 @@ static int apparmor_task_setrlimit(struct task_struct *task, } static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, - int sig, u32 secid) + int sig, const struct cred *cred) { struct aa_label *cl, *tl; int error; - if (secid) - /* TODO: after secid to label mapping is done. - * Dealing with USB IO specific behavior + if (cred) { + /* + * Dealing with USB IO specific behavior */ - return 0; + cl = aa_get_newest_cred_label(cred); + tl = aa_get_task_label(target); + error = aa_may_signal(cl, tl, sig); + aa_put_label(cl); + aa_put_label(tl); + return error; + } + cl = __begin_current_label_crit_section(); tl = aa_get_task_label(target); error = aa_may_signal(cl, tl, sig); diff --git a/security/security.c b/security/security.c index 1cd8526cb0b7..14c291910d25 100644 --- a/security/security.c +++ b/security/security.c @@ -1114,9 +1114,9 @@ int security_task_movememory(struct task_struct *p) } int security_task_kill(struct task_struct *p, struct siginfo *info, - int sig, u32 secid) + int sig, const struct cred *cred) { - return call_int_hook(task_kill, 0, p, info, sig, secid); + return call_int_hook(task_kill, 0, p, info, sig, cred); } int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8644d864e3c1..8abd542c6b7c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4036,16 +4036,19 @@ static int selinux_task_movememory(struct task_struct *p) } static int selinux_task_kill(struct task_struct *p, struct siginfo *info, - int sig, u32 secid) + int sig, const struct cred *cred) { + u32 secid; u32 perm; if (!sig) perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); - if (!secid) + if (!cred) secid = current_sid(); + else + secid = cred_sid(cred); return avc_has_perm(secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 03fdecba93bb..feada2665322 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2228,15 +2228,13 @@ static int smack_task_movememory(struct task_struct *p) * @p: the task object * @info: unused * @sig: unused - * @secid: identifies the smack to use in lieu of current's + * @cred: identifies the cred to use in lieu of current's * * Return 0 if write access 
is permitted * - * The secid behavior is an artifact of an SELinux hack - * in the USB code. Someday it may go away. */ static int smack_task_kill(struct task_struct *p, struct siginfo *info, - int sig, u32 secid) + int sig, const struct cred *cred) { struct smk_audit_info ad; struct smack_known *skp; @@ -2252,17 +2250,17 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * Sending a signal requires that the sender * can write the receiver. */ - if (secid == 0) { + if (cred == NULL) { rc = smk_curacc(tkp, MAY_DELIVER, &ad); rc = smk_bu_task(p, MAY_DELIVER, rc); return rc; } /* - * If the secid isn't 0 we're dealing with some USB IO + * If the cred isn't NULL we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. */ - skp = smack_from_secid(secid); + skp = smk_of_task(cred->security); rc = smk_access(skp, tkp, MAY_DELIVER, &ad); rc = smk_bu_note("USB signal", skp, tkp, MAY_DELIVER, rc); return rc; -- cgit v1.2.3-71-gd317 From 95da0cdb723260362fc126a563285ac66a193da7 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Tue, 6 Mar 2018 10:55:01 -0800 Subject: bpf: add support to read sample address in bpf program This commit adds new field "addr" to bpf_perf_event_data which could be read and used by bpf programs attached to perf events. The value of the field is copied from bpf_perf_event_data_kern.addr and contains the address value recorded by specifying sample_type with PERF_SAMPLE_ADDR when calling perf_event_open. Signed-off-by: Teng Qin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf_perf_event.h | 1 + kernel/trace/bpf_trace.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h index 8f95303f9d80..eb1b9d21250c 100644 --- a/include/uapi/linux/bpf_perf_event.h +++ b/include/uapi/linux/bpf_perf_event.h @@ -13,6 +13,7 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; + __u64 addr; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c0a9e310d715..c634e093951f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -726,8 +726,7 @@ const struct bpf_prog_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -738,8 +737,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -766,6 +770,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, 
si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, -- cgit v1.2.3-71-gd317 From 5ad751053704df3f00d2bb2dc9345c697c212150 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 6 Mar 2018 10:49:12 +0100 Subject: panic: Add closing panic marker parenthesis Otherwise it looks unbalanced. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180306094920.16917-2-bp@alien8.de --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..9fb023d0cae1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -289,7 +289,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif - pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); + pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); -- cgit v1.2.3-71-gd317 From 3f553b308bb004eb730da8e00a28150c157c7724 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Tue, 6 Mar 2018 23:16:24 +0800 Subject: module: propagate error in modules_open() otherwise kernel can oops later in seq_release() due to dereferencing null file->private_data which is only set if seq_open() succeeds. BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 IP: seq_release+0xc/0x30 Call Trace: close_pdeo+0x37/0xd0 proc_reg_release+0x5d/0x60 __fput+0x9d/0x1d0 ____fput+0x9/0x10 task_work_run+0x75/0x90 do_exit+0x252/0xa00 do_group_exit+0x36/0xb0 SyS_exit_group+0xf/0x10 Fixes: 516fb7f2e73d ("/proc/module: use the same logic as /proc/kallsyms for address exposure") Cc: Jessica Yu Cc: Linus Torvalds Cc: stable@vger.kernel.org # 4.15+ Signed-off-by: Leon Yu Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..e42764acedb4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file) m->private = kallsyms_show_value() ? NULL : (void *)8ul; } - return 0; + return err; } static const struct file_operations proc_modules_operations = { -- cgit v1.2.3-71-gd317 From 15564ff0a16e2d994e78d62c23d227ff182ee864 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 14 Feb 2018 11:18:21 -0500 Subject: audit: make ANOM_LINK obey audit_enabled and audit_dummy_context Audit link denied events emit disjointed records when audit is disabled. No records should be emitted when audit is disabled. 
See: https://github.com/linux-audit/audit-kernel/issues/21 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 69ef8de69f03..46cd8f66af17 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2315,6 +2315,9 @@ void audit_log_link_denied(const char *operation, const struct path *link) struct audit_buffer *ab; struct audit_names *name; + if (!audit_enabled || audit_dummy_context()) + return; + name = kzalloc(sizeof(*name), GFP_NOFS); if (!name) return; -- cgit v1.2.3-71-gd317 From 45b578fe4c3cade6f4ca1fc934ce199afd857edc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 14 Feb 2018 11:18:22 -0500 Subject: audit: link denied should not directly generate PATH record Audit link denied events generate duplicate PATH records which disagree in different ways from symlink and hardlink denials. audit_log_link_denied() should not directly generate PATH records. See: https://github.com/linux-audit/audit-kernel/issues/21 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 46cd8f66af17..3f2f143edadf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2313,31 +2313,19 @@ EXPORT_SYMBOL(audit_log_task_info); void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; - struct audit_names *name; if (!audit_enabled || audit_dummy_context()) return; - name = kzalloc(sizeof(*name), GFP_NOFS); - if (!name) - return; - /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - goto out; + return; audit_log_format(ab, "op=%s", operation); audit_log_task_info(ab, current); audit_log_format(ab, " res=0"); audit_log_end(ab); - - /* Generate AUDIT_PATH record with object. */ - name->type = AUDIT_TYPE_NORMAL; - audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry)); - audit_log_name(current->audit_context, name, link, 0, NULL); -out: - kfree(name); } /** -- cgit v1.2.3-71-gd317 From 13a453c241b78934a945b1af572d0533612c9bd1 Mon Sep 17 00:00:00 2001 From: Norbert Manthey Date: Tue, 27 Feb 2018 08:47:40 +0100 Subject: sched/fair: Add ';' after label attributes Due to using GCC defines for configuration, some labels might be unused in certain configurations. While adding a __maybe_unused to the label is fine in general, the line has to be terminated with ';'. This is also reflected in the GCC documentation, but GCC parsed the previous variant without an error message. This has been spotted while compiling with goto-cc, the compiler for the CPROVER tool suite. 
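In isolation the rule looks like this (a minimal sketch; the function is made up, and __maybe_unused is the kernel's wrapper around __attribute__((__unused__))):

static int pick(int have_task)
{
        if (have_task)
                goto done;

        return 0;

done: __maybe_unused;   /* the label attribute must be terminated with ';' */
        return 1;
}

Without the trailing ';' GCC still accepted the construct silently, which is why the omission in fair.c survived until a stricter parser rejected it.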
Signed-off-by: Norbert Manthey Signed-off-by: Michael Tautschnig Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1519717660-16157-1-git-send-email-nmanthey@amazon.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f5591071ae98..097db34d5ba2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6797,7 +6797,7 @@ simple: p = task_of(se); -done: __maybe_unused +done: __maybe_unused; #ifdef CONFIG_SMP /* * Move the next running task to the front of -- cgit v1.2.3-71-gd317 From 4042d003a0792a3b05c7c424219e4c6cf1abfe76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Dec 2017 15:37:26 +0100 Subject: cpufreq/schedutil: Remove unused CPUFREQ_DL Bitrot... Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Viresh Kumar Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/cpufreq.h | 3 +-- kernel/sched/deadline.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 0b55834efd46..d963cfd3a0c2 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,8 +9,7 @@ */ #define SCHED_CPUFREQ_RT (1U << 0) -#define SCHED_CPUFREQ_DL (1U << 1) -#define SCHED_CPUFREQ_IOWAIT (1U << 2) +#define SCHED_CPUFREQ_IOWAIT (1U << 1) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b7c2b35bec9..d1c7bf7c7e5b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -84,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline @@ -98,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); + cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } static inline -- cgit v1.2.3-71-gd317 From 8f111bc357aa811e0bb5fdfe34c4c9efdafc15b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Dec 2017 16:26:12 +0100 Subject: cpufreq/schedutil: Rewrite CPUFREQ_RT support Instead of trying to duplicate scheduler state to track if an RT task is running, directly use the scheduler runqueue state for it. This vastly simplifies things and fixes a number of bugs related to sugov and the scheduler getting out of sync wrt this state. As a consequence we now also update the remote cfs/dl state when iterating the shared mask. Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J.
Wysocki Cc: Thomas Gleixner Cc: Viresh Kumar Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/cpufreq.h | 3 +- kernel/sched/cpufreq_schedutil.c | 74 ++++++++++++++++++---------------------- kernel/sched/rt.c | 9 +++-- 3 files changed, 41 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index d963cfd3a0c2..b48f2fb3b316 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -8,8 +8,7 @@ * Interface between cpufreq drivers and the scheduler: */ -#define SCHED_CPUFREQ_RT (1U << 0) -#define SCHED_CPUFREQ_IOWAIT (1U << 1) +#define SCHED_CPUFREQ_IOWAIT (1U << 0) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index feb5f89020f2..89fe78ecb88c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -57,7 +57,6 @@ struct sugov_cpu { unsigned long util_cfs; unsigned long util_dl; unsigned long max; - unsigned int flags; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -183,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util; + + if (rq->rt.rt_nr_running) { + util = sg_cpu->max; + } else { + util = sg_cpu->util_dl; + if (rq->cfs.h_nr_running) + util += sg_cpu->util_cfs; + } + /* * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); + return min(util, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { - if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -262,12 +272,11 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -275,25 +284,22 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT) { - next_f = policy->cpuinfo.max_freq; - } else { - sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_policy, util, max); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq) { - next_f = sg_policy->next_freq; + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
+ */ + if (busy && next_f < sg_policy->next_freq) { + next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; - } + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; } + sugov_update_commit(sg_policy, time, next_f); } @@ -309,6 +315,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) unsigned long j_util, j_max; s64 delta_ns; + sugov_get_util(j_sg_cpu); + /* * If the CFS CPU utilization was last updated before the * previous frequency update and the time elapsed between the @@ -322,21 +330,15 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - j_sg_cpu->util_cfs = 0; - if (j_sg_cpu->util_dl == 0) - continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) - return policy->cpuinfo.max_freq; j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); + sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); if (j_util * max > j_max * util) { util = j_util; max = j_max; } - - sugov_iowait_boost(j_sg_cpu, &util, &max); } return get_next_freq(sg_policy, util, max); @@ -351,18 +353,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) raw_spin_lock(&sg_policy->update_lock); - sugov_get_util(sg_cpu); - sg_cpu->flags = flags; - - sugov_set_iowait_boost(sg_cpu, time); + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT) - next_f = sg_policy->policy->cpuinfo.max_freq; - else - next_f = sugov_next_freq_shared(sg_cpu, time); - + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } @@ -673,7 +668,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4f4fd3b157f1..86b77987435e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -957,9 +957,6 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_RT); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1001,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq, 0); } static void @@ -1017,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq, 0); } #if defined CONFIG_SMP -- cgit v1.2.3-71-gd317 From a22e47a4e3f5a9e50a827c5d94705ace3b1eac0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 10:01:24 +0100 Subject: sched/core: Convert nohz_flags to atomic_t Using atomic_t allows us to use the more flexible bitops provided there. Also it's smaller.
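Schematically, the conversion swaps the bitmap helpers for the atomic_t fetch/modify family (sketch only; note that NOHZ_BALANCE_KICK changes meaning from a bit number to a bit mask, as the sched.h hunk below shows):

/* before: unsigned long word, NOHZ_BALANCE_KICK is a bit number */
if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
        return;         /* kick already pending */
/* ... later ... */
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));

/* after: atomic_t word, NOHZ_BALANCE_KICK is a mask (BIT(1)) */
if (atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(cpu)) & NOHZ_BALANCE_KICK)
        return;         /* kick already pending */
/* ... later ... */
atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));

Because atomic_fetch_or() returns the prior value of the whole word, several flag bits can be tested and set in a single operation, which the following patches rely on.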
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 23 +++++++++++++++-------- kernel/sched/sched.h | 11 ++++++----- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f5eeb63ab5b..96ad1c003d74 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -583,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK)) return false; if (idle_cpu(cpu) && !need_resched()) @@ -593,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) * We can't run Idle Load Balance on this CPU for this time so we * cancel it and clear NOHZ_BALANCE_KICK */ - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu)); return false; } @@ -6074,7 +6074,7 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; - rq->nohz_flags = 0; + atomic_set(&rq->nohz_flags, 0); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 097db34d5ba2..5d150478dd58 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9072,6 +9072,7 @@ static inline int find_new_ilb(void) */ static void nohz_balancer_kick(void) { + unsigned int flags; int ilb_cpu; nohz.next_balance++; @@ -9081,7 +9082,8 @@ static void nohz_balancer_kick(void) if (ilb_cpu >= nr_cpu_ids) return; - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) + flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)); + if (flags & NOHZ_BALANCE_KICK) return; /* * Use smp_send_reschedule() instead of resched_cpu(). @@ -9095,7 +9097,9 @@ static void nohz_balancer_kick(void) void nohz_balance_exit_idle(unsigned int cpu) { - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + unsigned int flags = atomic_read(nohz_flags(cpu)); + + if (unlikely(flags & NOHZ_TICK_STOPPED)) { /* * Completely isolated CPUs don't ever set, so we must test. 
*/ @@ -9103,7 +9107,8 @@ void nohz_balance_exit_idle(unsigned int cpu) cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); atomic_dec(&nohz.nr_cpus); } - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + + atomic_andnot(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -9155,7 +9160,7 @@ void nohz_balance_enter_idle(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + if (atomic_read(nohz_flags(cpu)) & NOHZ_TICK_STOPPED) return; /* If we're a completely isolated CPU, we don't play: */ @@ -9164,7 +9169,7 @@ void nohz_balance_enter_idle(int cpu) cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } #endif @@ -9302,8 +9307,10 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - if (idle != CPU_IDLE || - !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK)) + return; + + if (idle != CPU_IDLE) goto end; for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { @@ -9349,7 +9356,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (likely(update_next_balance)) nohz.next_balance = next_balance; end: - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); + atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 23ba4dd76ac4..d98e761b962f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -763,7 +763,7 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - unsigned long nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ /* capture load from *all* tasks on this CPU: */ @@ -2034,10 +2034,11 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON -enum rq_nohz_flag_bits { - NOHZ_TICK_STOPPED, - NOHZ_BALANCE_KICK, -}; +#define NOHZ_TICK_STOPPED_BIT 0 +#define NOHZ_BALANCE_KICK_BIT 1 + +#define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT) +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -- cgit v1.2.3-71-gd317 From b7031a02ec753bf9b52a94a966b05e1abad3b7a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 10:11:09 +0100 Subject: sched/fair: Add NOHZ_STATS_KICK Split the NOHZ idle balancer into doing two separate actions: - update blocked load statistic - actually load-balance Since the latter requires the former, ensure this happens. For now always tag both bits at the same time. Prepares for a future where we can toggle only the STATS bit. 
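The split is easiest to see as a decode of the fetched flag word (a sketch assembled from the hunks below; for now both bits are always set together):

flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));

update_blocked_averages(cpu);                   /* NOHZ_STATS_KICK: refresh blocked load */
if (flags & NOHZ_BALANCE_KICK)
        rebalance_domains(rq, CPU_IDLE);        /* full balance only when requested */

Since a balance pass needs fresh statistics, NOHZ_BALANCE_KICK implies the stats update; a later patch can then request only the cheap NOHZ_STATS_KICK half.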
Suggested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 4 ++++ 3 files changed, 41 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 96ad1c003d74..69c9a6b07b61 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -583,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK)) + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) return false; if (idle_cpu(cpu) && !need_resched()) @@ -593,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) * We can't run Idle Load Balance on this CPU for this time so we * cancel it and clear NOHZ_BALANCE_KICK */ - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); return false; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d150478dd58..fc058967c999 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9082,8 +9082,8 @@ static void nohz_balancer_kick(void) if (ilb_cpu >= nr_cpu_ids) return; - flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)); - if (flags & NOHZ_BALANCE_KICK) + flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu)); + if (flags & NOHZ_KICK_MASK) return; /* * Use smp_send_reschedule() instead of resched_cpu(). @@ -9202,8 +9202,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; - update_blocked_averages(cpu); - rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -9298,20 +9296,27 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the CPUs for whom scheduler ticks are stopped. 
*/ -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - struct rq *rq; - int balance_cpu; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + int this_cpu = this_rq->cpu; + unsigned int flags; + int balance_cpu; + struct rq *rq; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK)) - return; + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + return false; - if (idle != CPU_IDLE) - goto end; + if (idle != CPU_IDLE) { + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + return false; + } + + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) @@ -9339,7 +9344,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) cpu_load_update_idle(rq); rq_unlock_irq(rq, &rf); - rebalance_domains(rq, CPU_IDLE); + update_blocked_averages(rq->cpu); + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -9348,6 +9355,10 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } } + update_blocked_averages(this_cpu); + if (flags & NOHZ_BALANCE_KICK) + rebalance_domains(this_rq, CPU_IDLE); + /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be @@ -9355,8 +9366,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) */ if (likely(update_next_balance)) nohz.next_balance = next_balance; -end: - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); + + return true; } /* @@ -9443,7 +9454,10 @@ unlock: return kick; } #else -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + return false; +} #endif /* @@ -9464,7 +9478,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. */ - nohz_idle_balance(this_rq, idle); + if (nohz_idle_balance(this_rq, idle)) + return; + + /* normal load balance */ + update_blocked_averages(this_rq->cpu); rebalance_domains(this_rq, idle); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d98e761b962f..5295f274053b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2036,9 +2036,13 @@ extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON #define NOHZ_TICK_STOPPED_BIT 0 #define NOHZ_BALANCE_KICK_BIT 1 +#define NOHZ_STATS_KICK_BIT 2 #define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT) #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -- cgit v1.2.3-71-gd317 From 4550487a993d579c7329bb5b19e516d36800c8bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 10:47:48 +0100 Subject: sched/fair: Restructure nohz_balance_kick() The current: if (nohz_kick_needed()) nohz_balancer_kick() is pointless complexity, fold them into a single call and avoid the various conditions at the call site. 
When we introduce multiple different needs to kick the ilb, the above construct also becomes a problem. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 218 ++++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc058967c999..fa483d889f07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9065,12 +9065,29 @@ static inline int find_new_ilb(void) return nr_cpu_ids; } +static inline void set_cpu_sd_state_busy(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick the * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(void) +static void kick_ilb(void) { unsigned int flags; int ilb_cpu; @@ -9085,6 +9102,7 @@ static void nohz_balancer_kick(void) flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; + /* * Use smp_send_reschedule() instead of resched_cpu(). * This way we generate a sched IPI on the target CPU which @@ -9092,7 +9110,94 @@ static void nohz_balancer_kick(void) * will be run before returning from the IPI. */ smp_send_reschedule(ilb_cpu); - return; +} + +/* + * Current heuristic for kicking the idle load balancer in the presence + * of an idle cpu in the system. + * - This rq has more than one task. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. + * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + * domain span are idle. + */ +static void nohz_balancer_kick(struct rq *rq) +{ + unsigned long now = jiffies; + struct sched_domain_shared *sds; + struct sched_domain *sd; + int nr_busy, i, cpu = rq->cpu; + bool kick = false; + + if (unlikely(rq->idle_balance)) + return; + + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + set_cpu_sd_state_busy(); + nohz_balance_exit_idle(cpu); + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return; + + if (time_before(now, nohz.next_balance)) + return; + + if (rq->nr_running >= 2) { + kick = true; + goto out; + } + + rcu_read_lock(); + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * XXX: write a coherent comment on why we do this. 
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + kick = true; + goto unlock; + } + + } + + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + + sd = rcu_dereference(per_cpu(sd_asym, cpu)); + if (sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (i == cpu || + !cpumask_test_cpu(i, nohz.idle_cpus_mask)) + continue; + + if (sched_asym_prefer(i, cpu)) { + kick = true; + goto unlock; + } + } + } +unlock: + rcu_read_unlock(); +out: + if (kick) + kick_ilb(); } void nohz_balance_exit_idle(unsigned int cpu) @@ -9112,23 +9217,6 @@ void nohz_balance_exit_idle(unsigned int cpu) } } -static inline void set_cpu_sd_state_busy(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || !sd->nohz_idle) - goto unlock; - sd->nohz_idle = 0; - - atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - void set_cpu_sd_state_idle(void) { struct sched_domain *sd; @@ -9171,6 +9259,8 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } +#else +static inline void nohz_balancer_kick(struct rq *rq) { } #endif static DEFINE_SPINLOCK(balancing); @@ -9369,90 +9459,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) return true; } - -/* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle CPU in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this CPU's scheduler group has - * multiple busy CPUs. - * - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler - * domain span are idle. - */ -static inline bool nohz_kick_needed(struct rq *rq) -{ - unsigned long now = jiffies; - struct sched_domain_shared *sds; - struct sched_domain *sd; - int nr_busy, i, cpu = rq->cpu; - bool kick = false; - - if (unlikely(rq->idle_balance)) - return false; - - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. - */ - set_cpu_sd_state_busy(); - nohz_balance_exit_idle(cpu); - - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return false; - - if (time_before(now, nohz.next_balance)) - return false; - - if (rq->nr_running >= 2) - return true; - - rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * XXX: write a coherent comment on why we do this. 
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - kick = true; - goto unlock; - } - - } - - sd = rcu_dereference(rq->sd); - if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { - kick = true; - goto unlock; - } - } - - sd = rcu_dereference(per_cpu(sd_asym, cpu)); - if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - - if (sched_asym_prefer(i, cpu)) { - kick = true; - goto unlock; - } - } - } -unlock: - rcu_read_unlock(); - return kick; -} #else static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -9497,10 +9503,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); -#endif + + nohz_balancer_kick(rq); } static void rq_online_fair(struct rq *rq) -- cgit v1.2.3-71-gd317 From a4064fb614f83c0a097c5ff7fe433c4aa139c7af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 10:42:50 +0100 Subject: sched/fair: Add NOHZ stats balancing Teach the idle balancer about the need to update statistics which have a different periodicity from regular balancing. Suggested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fa483d889f07..d8693fa9e7c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9053,6 +9053,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; unsigned long next_balance; /* in jiffy units */ + unsigned long next_stats; } nohz ____cacheline_aligned; static inline int find_new_ilb(void) @@ -9087,9 +9088,8 @@ unlock: * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). 
 */ -static void kick_ilb(void) +static void kick_ilb(unsigned int flags) { - unsigned int flags; int ilb_cpu; nohz.next_balance++; @@ -9099,7 +9099,7 @@ if (ilb_cpu >= nr_cpu_ids) return; - flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu)); + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; @@ -9129,7 +9129,7 @@ static void nohz_balancer_kick(struct rq *rq) struct sched_domain_shared *sds; struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; - bool kick = false; + unsigned int flags = 0; if (unlikely(rq->idle_balance)) return; @@ -9148,11 +9148,14 @@ if (likely(!atomic_read(&nohz.nr_cpus))) return; + if (time_after(now, nohz.next_stats)) + flags = NOHZ_STATS_KICK; + if (time_before(now, nohz.next_balance)) - return; + goto out; if (rq->nr_running >= 2) { - kick = true; + flags = NOHZ_KICK_MASK; goto out; } @@ -9165,7 +9168,7 @@ */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } @@ -9175,7 +9178,7 @@ if (sd) { if ((rq->cfs.h_nr_running >= 1) && check_cpu_capacity(rq, sd)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } @@ -9188,7 +9191,7 @@ continue; if (sched_asym_prefer(i, cpu)) { - kick = true; + flags = NOHZ_KICK_MASK; goto unlock; } } @@ -9196,8 +9199,8 @@ unlock: rcu_read_unlock(); out: - if (kick) - kick_ilb(); + if (flags) + kick_ilb(flags); } void nohz_balance_exit_idle(unsigned int cpu) @@ -9389,7 +9392,9 @@ out: static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; + unsigned long now = jiffies; + unsigned long next_balance = now + 60*HZ; + unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD); int update_next_balance = 0; int this_cpu = this_rq->cpu; unsigned int flags; @@ -9449,6 +9454,8 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (flags & NOHZ_BALANCE_KICK) rebalance_domains(this_rq, CPU_IDLE); + nohz.next_stats = next_stats; + /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be -- cgit v1.2.3-71-gd317 From e022e0d38ad475fc650f22efa3deb2fb96e62542 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 11:20:23 +0100 Subject: sched/fair: Update blocked load from NEWIDLE Since we already iterate CPUs looking for work on NEWIDLE, use this iteration to age the blocked load. If the domain for which this is done completely spans the idle set, we can push the ILB-based aging forward.
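The aging added here is a rate-limited refresh: a remote rq is only touched when its blocked-load snapshot is stale. The shape of that check can be sketched in userspace C (jiffies replaced by a plain counter, names hypothetical):

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies;	/* stand-in for the kernel tick counter */

struct rq_stats {
	unsigned long last_blocked_load_update_tick;
	bool idle;
};

/* Mirror of the update_nohz_stats() idea: skip CPUs that are not idle
 * or whose blocked-load snapshot was already refreshed this tick. */
static bool maybe_update(struct rq_stats *rq)
{
	if (!rq->idle)
		return false;
	if (jiffies <= rq->last_blocked_load_update_tick)
		return false;		/* snapshot still fresh */
	rq->last_blocked_load_update_tick = jiffies;
	return true;			/* would call update_blocked_averages() */
}

int main(void)
{
	struct rq_stats rq = { .last_blocked_load_update_tick = 0, .idle = true };

	jiffies = 1;
	printf("first pass:  %s\n", maybe_update(&rq) ? "updated" : "skipped");
	printf("second pass: %s\n", maybe_update(&rq) ? "updated" : "skipped");
	return 0;
}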
Suggested-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 45 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69c9a6b07b61..8a10a2ce30a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6074,6 +6074,7 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; + rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); #endif #endif /* CONFIG_SMP */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8693fa9e7c5..85232dad89c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5376,6 +5376,14 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } + +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ + unsigned long next_stats; +} nohz ____cacheline_aligned; + #endif /* CONFIG_NO_HZ_COMMON */ /** @@ -7022,6 +7030,7 @@ enum fbq_type { regular, remote, all }; #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_NOHZ_STATS 0x10 struct lb_env { struct sched_domain *sd; @@ -7460,6 +7469,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); } + +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7519,6 +7532,9 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); +#ifdef CONFIG_NO_HZ_COMMON + rq->last_blocked_load_update_tick = jiffies; +#endif rq_unlock_irqrestore(rq, &rf); } @@ -7853,6 +7869,21 @@ group_type group_classify(struct sched_group *group, return group_other; } +static void update_nohz_stats(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_COMMON + unsigned int cpu = rq->cpu; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; + + if (!time_after(jiffies, rq->last_blocked_load_update_tick)) + return; + + update_blocked_averages(cpu); +#endif +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -7875,6 +7906,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + if (env->flags & LBF_NOHZ_STATS) + update_nohz_stats(rq); + /* Bias balancing toward CPUs of our domain: */ if (local_group) load = target_load(i, load_idx); @@ -8030,6 +8064,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; +#ifdef CONFIG_NO_HZ_COMMON + if (env->idle == CPU_NEWLY_IDLE) { + env->flags |= LBF_NOHZ_STATS; + + if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) + nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD); + } +#endif + load_idx = get_sd_load_idx(env->sd, env->idle); do { @@ -9049,12 +9092,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. 
*/ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ - unsigned long next_stats; -} nohz ____cacheline_aligned; static inline int find_new_ilb(void) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5295f274053b..21381d276709 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -762,6 +762,7 @@ struct rq { #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; #endif /* CONFIG_SMP */ atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -- cgit v1.2.3-71-gd317 From 00357f5ec5d67a52a175da6f29f85c2c19d59bc8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Dec 2017 15:06:50 +0100 Subject: sched/nohz: Clean up nohz enter/exit The primary observation is that nohz enter/exit is always from the current CPU, therefore NOHZ_TICK_STOPPED does not in fact need to be an atomic. Secondary is that we appear to have 2 nearly identical hooks in the nohz enter code, set_cpu_sd_state_idle() and nohz_balance_enter_idle(). Fold the whole set_cpu_sd_state thing into nohz_balance_{enter,exit}_idle. Removes an atomic op from both enter and exit paths. Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/nohz.h | 2 -- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 73 +++++++++++++++++++++++----------------------- kernel/sched/sched.h | 11 ++++--- kernel/time/tick-sched.c | 7 ----- 5 files changed, 43 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 094217273ff9..b36f4cf38111 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { } #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); -extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } -static inline void set_cpu_sd_state_idle(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a10a2ce30a4..c7faeb7bd03a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5861,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(cpu); + nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 85232dad89c9..494d5db9a6cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9103,23 +9103,6 @@ static inline int find_new_ilb(void) return nr_cpu_ids; } -static inline void set_cpu_sd_state_busy(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || !sd->nohz_idle) - goto unlock; - sd->nohz_idle = 0; - - atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick the * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle @@ -9175,8 +9158,7 @@ static void nohz_balancer_kick(struct rq *rq) * We may be recently in ticked or tickless idle mode. At the first * busy tick after returning from idle, we will update the busy stats. 
*/ - set_cpu_sd_state_busy(); - nohz_balance_exit_idle(cpu); + nohz_balance_exit_idle(rq); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -9240,27 +9222,39 @@ out: kick_ilb(flags); } -void nohz_balance_exit_idle(unsigned int cpu) +static void set_cpu_sd_state_busy(int cpu) { - unsigned int flags = atomic_read(nohz_flags(cpu)); + struct sched_domain *sd; - if (unlikely(flags & NOHZ_TICK_STOPPED)) { - /* - * Completely isolated CPUs don't ever set, so we must test. - */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); - atomic_andnot(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); } -void set_cpu_sd_state_idle(void) +void nohz_balance_exit_idle(struct rq *rq) +{ + SCHED_WARN_ON(rq != this_rq()); + + if (likely(!rq->nohz_tick_stopped)) + return; + + rq->nohz_tick_stopped = 0; + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + + set_cpu_sd_state_busy(rq->cpu); +} + +static void set_cpu_sd_state_idle(int cpu) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_llc, cpu)); @@ -9280,6 +9274,10 @@ unlock: */ void nohz_balance_enter_idle(int cpu) { + struct rq *rq = cpu_rq(cpu); + + SCHED_WARN_ON(cpu != smp_processor_id()); + /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) return; @@ -9288,16 +9286,19 @@ void nohz_balance_enter_idle(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; - if (atomic_read(nohz_flags(cpu)) & NOHZ_TICK_STOPPED) + if (rq->nohz_tick_stopped) return; /* If we're a completely isolated CPU, we don't play: */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(rq)) return; + rq->nohz_tick_stopped = 1; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); - atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + + set_cpu_sd_state_idle(cpu); } #else static inline void nohz_balancer_kick(struct rq *rq) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 21381d276709..818f22dbc7ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -764,6 +764,7 @@ struct rq { unsigned long last_load_update_tick; unsigned long last_blocked_load_update_tick; #endif /* CONFIG_SMP */ + unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ @@ -2035,11 +2036,9 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON -#define NOHZ_TICK_STOPPED_BIT 0 -#define NOHZ_BALANCE_KICK_BIT 1 -#define NOHZ_STATS_KICK_BIT 2 +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 -#define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT) #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) @@ -2047,9 +2046,9 @@ extern void cfs_bandwidth_usage_dec(void); #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -extern void nohz_balance_exit_idle(unsigned int cpu); +extern void nohz_balance_exit_idle(struct rq *rq); #else -static inline void nohz_balance_exit_idle(unsigned int cpu) { } +static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f2fa2e940fe5..ab92aa4442df 100644 --- a/kernel/time/tick-sched.c +++ 
b/kernel/time/tick-sched.c @@ -954,13 +954,6 @@ void tick_nohz_idle_enter(void) struct tick_sched *ts; lockdep_assert_irqs_enabled(); - /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ - set_cpu_sd_state_idle(); local_irq_disable(); -- cgit v1.2.3-71-gd317 From ea14b57e8a181ac0561eba7a787e088f8c89f822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Feb 2018 10:27:00 +0100 Subject: sched/cpufreq: Provide migration hint It was suggested that a migration hint might be useful for the CPU-freq governors. Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Viresh Kumar Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/cpufreq.h | 1 + kernel/sched/fair.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index b48f2fb3b316..59667444669f 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -9,6 +9,7 @@ */ #define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_MIGRATION (1U << 1) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 494d5db9a6cd..e8f5efe2936c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -772,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -3009,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq) { + if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3028,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq, 0); + cpufreq_update_util(rq, flags); } } @@ -3686,7 +3686,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #endif if (decayed) - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); return decayed; } @@ -3699,7 +3699,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current.
 */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3735,7 +3735,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, flags); } /** @@ -3754,7 +3754,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } /* @@ -3784,7 +3784,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (!se->avg.last_update_time && (flags & DO_ATTACH)) { - attach_entity_load_avg(cfs_rq, se); + /* + * DO_ATTACH means we're here from enqueue_entity(). + * !last_update_time means we've passed through + * migrate_task_rq_fair() indicating we migrated. + * + * IOW we're enqueueing a task on a new CPU. + */ + attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); } else if (decayed && (flags & UPDATE_TG)) @@ -3880,13 +3887,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq); + cfs_rq_util_change(cfs_rq, 0); } static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -9726,7 +9733,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se); + attach_entity_load_avg(cfs_rq, se, 0); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } -- cgit v1.2.3-71-gd317 From f643ea2207010db26f17fca99db031bad87c8461 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 Feb 2018 11:31:17 +0100 Subject: sched/nohz: Stop NOHZ stats when decayed Stop the periodic update of blocked load when all idle CPUs have fully decayed. We introduce a new nohz.has_blocked flag that reflects whether some idle CPUs have blocked load that has to be periodically updated. nohz.has_blocked is set every time an idle CPU can have blocked load, and it is cleared when no more blocked load has been detected during an update. We don't need atomic operations, only to take care of the right ordering when updating nohz.idle_cpus_mask and nohz.has_blocked.
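The required ordering can be sketched with C11 atomics as a userspace analogue (seq_cst stores stand in for the explicit kernel barriers; this is an illustration of the protocol, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic bool idle_cpus_mask;	/* one-CPU stand-in for the cpumask */
static _Atomic int nohz_has_blocked;
static bool rq_has_blocked_load;

/* CPU entering idle: publish the blocked load *before* joining the
 * mask, then raise the global hint (mirrors nohz_balance_enter_idle()). */
static void enter_idle(void)
{
	rq_has_blocked_load = true;
	atomic_store(&idle_cpus_mask, true);	/* + smp_mb__after_atomic() */
	atomic_store(&nohz_has_blocked, 1);
}

/* ilb: clear the hint first, then scan; any CPU the scan misses must
 * have re-raised the hint after our clear (mirrors nohz_idle_balance()). */
static void ilb(void)
{
	atomic_store(&nohz_has_blocked, 0);	/* + smp_mb() */
	if (atomic_load(&idle_cpus_mask) && rq_has_blocked_load) {
		rq_has_blocked_load = false;	/* decayed: drop the flag */
		printf("updated blocked load\n");
	}
}

int main(void)
{
	enter_idle();
	ilb();
	printf("has_blocked = %d\n", atomic_load(&nohz_has_blocked));
	return 0;
}

If enter_idle() runs concurrently just after the ilb's clear, its final store re-raises the hint, so the next periodic update cannot be missed.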
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: brendan.jackman@arm.com Cc: dietmar.eggemann@arm.com Cc: morten.rasmussen@foss.arm.com Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1518517879-2280-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 116 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 97 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8f5efe2936c..78b06a0814d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5387,8 +5387,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; + int has_blocked; /* Idle CPUS has blocked load */ unsigned long next_balance; /* in jiffy units */ - unsigned long next_stats; + unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; #endif /* CONFIG_NO_HZ_COMMON */ @@ -7038,6 +7039,7 @@ enum fbq_type { regular, remote, all }; #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 #define LBF_NOHZ_STATS 0x10 +#define LBF_NOHZ_AGAIN 0x20 struct lb_env { struct sched_domain *sd; @@ -7422,8 +7424,6 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } -#ifdef CONFIG_FAIR_GROUP_SCHED - static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { if (cfs_rq->load.weight) @@ -7441,11 +7441,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } +#ifdef CONFIG_FAIR_GROUP_SCHED + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; + bool done = true; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -7475,10 +7478,14 @@ static void update_blocked_averages(int cpu) */ if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); + else + done = false; } #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; + if (done) + rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); } @@ -7541,6 +7548,8 @@ static inline void update_blocked_averages(int cpu) update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; + if (cfs_rq_is_decayed(cfs_rq)) + rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); } @@ -7876,18 +7885,25 @@ group_type group_classify(struct sched_group *group, return group_other; } -static void update_nohz_stats(struct rq *rq) +static bool update_nohz_stats(struct rq *rq) { #ifdef CONFIG_NO_HZ_COMMON unsigned int cpu = rq->cpu; + if (!rq->has_blocked_load) + return false; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) - return; + return false; if (!time_after(jiffies, rq->last_blocked_load_update_tick)) - return; + return true; update_blocked_averages(cpu); + + return rq->has_blocked_load; +#else + return false; #endif } @@ -7913,8 +7929,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - if (env->flags & LBF_NOHZ_STATS) - update_nohz_stats(rq); + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq)) + env->flags |= LBF_NOHZ_AGAIN; /* Bias balancing toward CPUs of our domain: */ if (local_group) @@ -8072,12 +8088,8 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd prefer_sibling = 1; #ifdef CONFIG_NO_HZ_COMMON - if (env->idle == CPU_NEWLY_IDLE) { + if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) env->flags |= LBF_NOHZ_STATS; - - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD); - } #endif load_idx = get_sd_load_idx(env->sd, env->idle); @@ -8133,6 +8145,15 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); +#ifdef CONFIG_NO_HZ_COMMON + if ((env->flags & LBF_NOHZ_AGAIN) && + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { + + WRITE_ONCE(nohz.next_blocked, + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); + } +#endif + if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); @@ -9174,7 +9195,8 @@ static void nohz_balancer_kick(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return; - if (time_after(now, nohz.next_stats)) + if (READ_ONCE(nohz.has_blocked) && + time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; if (time_before(now, nohz.next_balance)) @@ -9293,8 +9315,21 @@ void nohz_balance_enter_idle(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; + /* + * Can be set safely without rq->lock held + * If a clear happens, it will have evaluated last additions because + * rq->lock is held during the check and the clear + */ + rq->has_blocked_load = 1; + + /* + * The tick is still stopped but load could have been added in the + * meantime. We set the nohz.has_blocked flag to trig a check of the + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear + * of nohz.has_blocked can only happen after checking the new load + */ if (rq->nohz_tick_stopped) - return; + goto out; /* If we're a completely isolated CPU, we don't play: */ if (on_null_domain(rq)) @@ -9305,7 +9340,21 @@ void nohz_balance_enter_idle(int cpu) cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); + /* + * Ensures that if nohz_idle_balance() fails to observe our + * @idle_cpus_mask store, it must observe the @has_blocked + * store. + */ + smp_mb__after_atomic(); + set_cpu_sd_state_idle(cpu); + +out: + /* + * Each time a cpu enter idle, we assume that it has blocked load and + * enable the periodic update of the load of idle cpus + */ + WRITE_ONCE(nohz.has_blocked, 1); } #else static inline void nohz_balancer_kick(struct rq *rq) { } @@ -9439,7 +9488,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; unsigned long next_balance = now + 60*HZ; - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD); + bool has_blocked_load = false; int update_next_balance = 0; int this_cpu = this_rq->cpu; unsigned int flags; @@ -9458,6 +9507,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + /* + * We assume there will be no idle load after this update and clear + * the has_blocked flag. If a cpu enters idle in the mean time, it will + * set the has_blocked flag and trig another update of idle load. + * Because a cpu that becomes idle, is added to idle_cpus_mask before + * setting the flag, we are sure to not clear the state and not + * check the load of an idle cpu. + */ + WRITE_ONCE(nohz.has_blocked, 0); + + /* + * Ensures that if we miss the CPU, we must see the has_blocked + * store from nohz_balance_enter_idle(). 
+ */ + smp_mb(); + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -9467,11 +9532,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) - break; + if (need_resched()) { + has_blocked_load = true; + goto abort; + } rq = cpu_rq(balance_cpu); + update_blocked_averages(rq->cpu); + has_blocked_load |= rq->has_blocked_load; + /* * If time for next balance is due, * do the balance. */ @@ -9484,7 +9554,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) cpu_load_update_idle(rq); rq_unlock_irq(rq, &rf); - update_blocked_averages(rq->cpu); if (flags & NOHZ_BALANCE_KICK) rebalance_domains(rq, CPU_IDLE); } @@ -9499,7 +9568,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (flags & NOHZ_BALANCE_KICK) rebalance_domains(this_rq, CPU_IDLE); - nohz.next_stats = next_stats; + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); /* * next_balance will be updated only when there is a need. @@ -10135,6 +10210,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; + nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif #endif /* SMP */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 818f22dbc7ea..22909ffc04fb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -763,6 +763,7 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; -- cgit v1.2.3-71-gd317 From 1936c53ce8c8d4555e9ccad2dc8d98e0637b11f7 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 Feb 2018 11:31:18 +0100 Subject: sched/fair: Reduce the periodic update duration Instead of using cfs_rq_is_decayed(), which monitors all *_avg and *_sum, we create cfs_rq_has_blocked(), which only takes care of util_avg and load_avg. We are only interested in these two values, which decay faster than the *_sum, so we can stop the periodic update earlier.
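To see why the *_avg values reach zero sooner: util_avg is util_sum scaled down by the PELT divider (~47742), and both decay with y^32 = 1/2. A rough numeric illustration (the constants are assumed from the PELT code, the loop itself is plain arithmetic):

#include <math.h>
#include <stdio.h>

#define LOAD_AVG_MAX 47742	/* PELT divider; max of the geometric series */

int main(void)
{
	double y = pow(0.5, 1.0 / 32);		/* decay per ~1 ms period */
	double sum = (double)LOAD_AVG_MAX * 1024;	/* util_sum of a fully busy entity */
	int ms = 0, avg_zero = 0;

	while (sum >= 1.0) {
		sum *= y;
		ms++;
		/* util_avg = util_sum / divider truncates to 0 here */
		if (!avg_zero && (unsigned long)sum / LOAD_AVG_MAX == 0)
			avg_zero = ms;
	}
	printf("util_avg decays to 0 after ~%d ms, util_sum after ~%d ms\n",
	       avg_zero, ms);
	return 0;
}

With roughly 320ms versus 800ms to fully decay, watching only load_avg/util_avg lets the periodic update stop about 2.5x earlier.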
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: brendan.jackman@arm.com Cc: dietmar.eggemann@arm.com Cc: morten.rasmussen@foss.arm.com Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1518517879-2280-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78b06a0814d1..aad7c03dbad8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7424,6 +7424,19 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->avg.load_avg) + return true; + + if (cfs_rq->avg.util_avg) + return true; + + return false; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { if (cfs_rq->load.weight) @@ -7441,8 +7454,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } -#ifdef CONFIG_FAIR_GROUP_SCHED - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7478,7 +7489,9 @@ static void update_blocked_averages(int cpu) */ if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); - else + + /* Don't need periodic decay once load/util_avg are null */ + if (cfs_rq_has_blocked(cfs_rq)) done = false; } @@ -7548,7 +7561,7 @@ static inline void update_blocked_averages(int cpu) update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (cfs_rq_is_decayed(cfs_rq)) + if (!cfs_rq_has_blocked(cfs_rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); -- cgit v1.2.3-71-gd317 From 63928384faefba1b31c3bb77361965715a9fc71c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Feb 2018 16:54:17 +0100 Subject: sched/nohz: Optimize nohz_idle_balance() Avoid calling update_blocked_averages() when the CPU does not in fact have any blocked load, by re-using/extending update_nohz_stats().
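Schematically, the extended helper now serves both the rate-limited stats path and the ilb path that must always refresh; a simplified userspace sketch of the control flow (kernel calls replaced by stubs, the refresh outcome faked):

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies = 100;

struct rq_sketch {
	bool has_blocked_load;
	unsigned long last_blocked_load_update_tick;
};

/* Returns true while the rq may still carry blocked load. With force
 * set (ilb path) the rate limit on refreshes is bypassed; without it
 * (stats path) a fresh-enough snapshot is kept as-is. */
static bool update_nohz_stats(struct rq_sketch *rq, bool force)
{
	if (!rq->has_blocked_load)
		return false;			/* fully decayed: nothing to do */

	if (!force && jiffies <= rq->last_blocked_load_update_tick)
		return true;			/* recent enough, skip the update */

	rq->last_blocked_load_update_tick = jiffies;
	rq->has_blocked_load = false;		/* pretend everything decayed */
	return rq->has_blocked_load;
}

int main(void)
{
	struct rq_sketch rq = { true, 100 };

	printf("lazy:  %d\n", update_nohz_stats(&rq, false));	/* rate-limited: 1 */
	printf("force: %d\n", update_nohz_stats(&rq, true));	/* refreshed:    0 */
	return 0;
}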
Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aad7c03dbad8..5c357561db5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7898,7 +7898,7 @@ group_type group_classify(struct sched_group *group, return group_other; } -static bool update_nohz_stats(struct rq *rq) +static bool update_nohz_stats(struct rq *rq, bool force) { #ifdef CONFIG_NO_HZ_COMMON unsigned int cpu = rq->cpu; @@ -7909,7 +7909,7 @@ static bool update_nohz_stats(struct rq *rq) if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) return false; - if (!time_after(jiffies, rq->last_blocked_load_update_tick)) + if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) return true; update_blocked_averages(cpu); @@ -7942,7 +7942,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq)) + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; /* Bias balancing toward CPUs of our domain: */ @@ -9552,8 +9552,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - update_blocked_averages(rq->cpu); - has_blocked_load |= rq->has_blocked_load; + has_blocked_load |= update_nohz_stats(rq, true); /* * If time for next balance is due, -- cgit v1.2.3-71-gd317 From af3fe03c562055bc3c116eabe73f141ae31bf234 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Feb 2018 10:58:39 +0100 Subject: sched/fair: Move rebalance_domains() This pure code movement results in two #ifdef CONFIG_NO_HZ_COMMON sections landing next to each other. Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 236 ++++++++++++++++++++++++++-------------------------- 1 file changed, 118 insertions(+), 118 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c357561db5d..0da79d8a6a2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9121,6 +9121,124 @@ out_unlock: return 0; } +static DEFINE_SPINLOCK(balancing); + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in init_sched_domains. + */ +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +{ + int continue_balancing = 1; + int cpu = rq->cpu; + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize, need_decay = 0; + u64 max_cost = 0; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. 
+ */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } + + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + + need_serialize = sd->flags & SD_SERIALIZE; + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + /* + * The LBF_DST_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. + */ + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + } + if (need_decay) { + /* + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. + */ + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); + } + rcu_read_unlock(); + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) { + rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. + */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } +} + static inline int on_null_domain(struct rq *rq) { return unlikely(!rcu_dereference_sched(rq->sd)); @@ -9373,124 +9491,6 @@ out: static inline void nohz_balancer_kick(struct rq *rq) { } #endif -static DEFINE_SPINLOCK(balancing); - -/* - * Scale the max load_balance interval with the number of CPUs in the system. - * This trades load-balance latency on larger machines for less cross talk. - */ -void update_max_interval(void) -{ - max_load_balance_interval = HZ*num_online_cpus()/10; -} - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in init_sched_domains. - */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) -{ - int continue_balancing = 1; - int cpu = rq->cpu; - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize, need_decay = 0; - u64 max_cost = 0; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - /* - * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. 
- */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } - max_cost += sd->max_newidle_lb_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!continue_balancing) { - if (need_decay) - continue; - break; - } - - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { - /* - * The LBF_DST_PINNED logic could have changed - * env->dst_cpu, so we can't know our idle - * state even if we migrated tasks. Update it. - */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - } - if (need_decay) { - /* - * Ensure the rq-wide value also decays but keep it at a - * reasonable floor to avoid funnies with rq->avg_idle. - */ - rq->max_idle_balance_cost = - max((u64)sysctl_sched_migration_cost, max_cost); - } - rcu_read_unlock(); - - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) { - rq->next_balance = next_balance; - -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } -} - #ifdef CONFIG_NO_HZ_COMMON /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the -- cgit v1.2.3-71-gd317 From dd707247ababb685ac4b8b2c6a7bf2923725e6ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Feb 2018 10:59:45 +0100 Subject: sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks Now that we have two back-to-back NO_HZ_COMMON blocks, merge them. Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0da79d8a6a2c..d6767f533029 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9487,11 +9487,7 @@ out: */ WRITE_ONCE(nohz.has_blocked, 1); } -#else -static inline void nohz_balancer_kick(struct rq *rq) { } -#endif -#ifdef CONFIG_NO_HZ_COMMON /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the CPUs for whom scheduler ticks are stopped. 
@@ -9598,12 +9594,14 @@ abort: return true; } -#else +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void nohz_balancer_kick(struct rq *rq) { } + static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { return false; } -#endif +#endif /* CONFIG_NO_HZ_COMMON */ /* * run_rebalance_domains is triggered when needed from the scheduler tick. -- cgit v1.2.3-71-gd317 From 47ea54121e46a685aa2320df8b0f71aaeedff23f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Feb 2018 11:45:47 +0100 Subject: sched/fair: Move idle_balance() We're going to want to call nohz_idle_balance() or parts thereof from idle_balance(). Since we already have a forward declaration of idle_balance() move it down such that it's below nohz_idle_balance() avoiding the need for a forward declaration for that. Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 228 ++++++++++++++++++++++++++-------------------------- 1 file changed, 114 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6767f533029..058badcfa94b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8916,120 +8916,6 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) *next_balance = next; } -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. - */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) -{ - unsigned long next_balance = jiffies + HZ; - int this_cpu = this_rq->cpu; - struct sched_domain *sd; - int pulled_task = 0; - u64 curr_cost = 0; - - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq); - - /* - * Do not pull tasks towards !active CPUs... - */ - if (!cpu_active(this_cpu)) - return 0; - - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - rq_unpin_lock(this_rq, rf); - - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (sd) - update_next_balance(sd, &next_balance); - rcu_read_unlock(); - - goto out; - } - - raw_spin_unlock(&this_rq->lock); - - update_blocked_averages(this_cpu); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); - break; - } - - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - - pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, - &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; - } - - update_next_balance(sd, &next_balance); - - /* - * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. 
- */ - if (pulled_task || this_rq->nr_running > 0) - break; - } - rcu_read_unlock(); - - raw_spin_lock(&this_rq->lock); - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - - /* - * While browsing the domains, we released the rq lock, a task could - * have been enqueued in the meantime. Since we're not going idle, - * pretend we pulled a task. - */ - if (this_rq->cfs.h_nr_running && !pulled_task) - pulled_task = 1; - -out: - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - - if (pulled_task) - this_rq->idle_stamp = 0; - - rq_repin_lock(this_rq, rf); - - return pulled_task; -} - /* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at @@ -9603,6 +9489,120 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } #endif /* CONFIG_NO_HZ_COMMON */ +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; + struct sched_domain *sd; + int pulled_task = 0; + u64 curr_cost = 0; + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, &next_balance); + rcu_read_unlock(); + + goto out; + } + + raw_spin_unlock(&this_rq->lock); + + update_blocked_averages(this_cpu); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + int continue_balancing = 1; + u64 t0, domain_cost; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, &next_balance); + break; + } + + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, + &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; + } + + update_next_balance(sd, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + break; + } + rcu_read_unlock(); + + raw_spin_lock(&this_rq->lock); + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. 
+ */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) + this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} + /* * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). -- cgit v1.2.3-71-gd317 From 31e77c93e432dec79c7d90b888bbfc3652592741 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 14 Feb 2018 16:26:46 +0100 Subject: sched/fair: Update blocked load when newly idle When NEWLY_IDLE load balance is not triggered, we might need to update the blocked load anyway. We can kick an ilb so an idle CPU will take care of updating blocked load or we can try to update them locally before entering idle. In the latter case, we reuse part of the nohz_idle_balance. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: brendan.jackman@arm.com Cc: dietmar.eggemann@arm.com Cc: morten.rasmussen@foss.arm.com Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1518622006-16089-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 105 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 058badcfa94b..3582117e1580 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9375,10 +9375,14 @@ out: } /* - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the CPUs for whom scheduler ticks are stopped. + * Internal function that runs load balance for all idle cpus. The load balance + * can be a simple update of blocked load or a complete load balance with + * tasks movement depending of flags. + * The function returns false if the loop has stopped before running + * through all idle CPUs. 
*/ -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, + enum cpu_idle_type idle) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -9386,20 +9390,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) bool has_blocked_load = false; int update_next_balance = 0; int this_cpu = this_rq->cpu; - unsigned int flags; int balance_cpu; + int ret = false; struct rq *rq; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } - - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* @@ -9443,10 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { struct rq_flags rf; - rq_lock_irq(rq, &rf); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - rq_unlock_irq(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) rebalance_domains(rq, CPU_IDLE); @@ -9458,13 +9452,21 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } } - update_blocked_averages(this_cpu); + /* Newly idle CPU doesn't need an update */ + if (idle != CPU_NEWLY_IDLE) { + update_blocked_averages(this_cpu); + has_blocked_load |= this_rq->has_blocked_load; + } + if (flags & NOHZ_BALANCE_KICK) rebalance_domains(this_rq, CPU_IDLE); WRITE_ONCE(nohz.next_blocked, now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + /* The full idle balance loop has been done */ + ret = true; + abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) @@ -9478,15 +9480,79 @@ abort: if (likely(update_next_balance)) nohz.next_balance = next_balance; + return ret; +} + +/* + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + int this_cpu = this_rq->cpu; + unsigned int flags; + + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + return false; + + if (idle != CPU_IDLE) { + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + return false; + } + + /* + * barrier, pairs with nohz_balance_enter_idle(), ensures ... + */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); + if (!(flags & NOHZ_KICK_MASK)) + return false; + + _nohz_idle_balance(this_rq, flags, idle); + return true; } + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + + /* + * This CPU doesn't want to be disturbed by scheduler + * housekeeping + */ + if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + return; + + /* Will wake up very soon. No time for doing anything else*/ + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* Don't need to update blocked load of idle CPUs*/ + if (!READ_ONCE(nohz.has_blocked) || + time_before(jiffies, READ_ONCE(nohz.next_blocked))) + return; + + raw_spin_unlock(&this_rq->lock); + /* + * This CPU is going to be idle and blocked load of idle CPUs + * need to be updated. Run the ilb locally as it is a good + * candidate for ilb instead of waking up another idle CPU. + * Kick an normal ilb if we failed to do the update. 
*/ + if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) + kick_ilb(NOHZ_STATS_KICK); + raw_spin_lock(&this_rq->lock); +} + #else /* !CONFIG_NO_HZ_COMMON */ static inline void nohz_balancer_kick(struct rq *rq) { } -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { return false; } + +static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* @@ -9523,12 +9589,15 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { + rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); + nohz_newidle_balance(this_rq); + goto out; } -- cgit v1.2.3-71-gd317 From d17067e4487adc53bedb43681b3cb5a1714ff6ca Mon Sep 17 00:00:00 2001 From: Gaurav Jindal Date: Wed, 21 Feb 2018 18:24:07 +0530 Subject: sched/completions: Use bool in try_wait_for_completion() Since the return type of the function is bool, the internal 'ret' variable should be bool too. Signed-off-by: Gaurav Jindal Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180221125407.GA14292@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 5d2d56b0817a..e426b0cb9ac6 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); bool try_wait_for_completion(struct completion *x) { unsigned long flags; - int ret = 1; + bool ret = true; /* * Since x->done will need to be locked only @@ -289,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) * return early in the blocking case. */ if (!READ_ONCE(x->done)) - return 0; + return false; spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) - ret = 0; + ret = false; else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); -- cgit v1.2.3-71-gd317 From bd903afeb504db5655a45bb4cf86f38be5b1bf62 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 5 Mar 2018 21:55:04 -0800 Subject: perf/core: Fix ctx_event_type in ctx_resched() In ctx_resched(), EVENT_FLEXIBLE should be sched_out when EVENT_PINNED is added. However, ctx_resched() calculates ctx_event_type before checking this condition. As a result, pinned events will NOT get higher priority than flexible events. The following shows this issue on an Intel CPU (where ref-cycles can only use one hardware counter). 1. First start: perf stat -C 0 -e ref-cycles -I 1000 2. Then, in the second console, run: perf stat -C 0 -e ref-cycles:D -I 1000 The second perf uses pinned events, which are expected to have higher priority. However, because it fails to be scheduled in ctx_resched(), it is never run. This patch fixes this by calculating ctx_event_type after re-evaluating event_type.
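[ An illustrative sketch, not part of the original patch: the two orderings in ctx_resched(), restating the hunk below in isolation with the rest of the function elided.

	enum event_type_t ctx_event_type;
	bool cpu_event = !!(event_type & EVENT_CPU);

	/*
	 * Buggy order: the sched-out mask was derived here, before
	 * EVENT_FLEXIBLE could be folded in, so flexible events were
	 * never scheduled out on behalf of an incoming pinned event:
	 *
	 *	ctx_event_type = event_type & EVENT_ALL;
	 */

	if (event_type & EVENT_PINNED)
		event_type |= EVENT_FLEXIBLE;

	/* Fixed order: derive the mask only once event_type is final. */
	ctx_event_type = event_type & EVENT_ALL;
]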
Reported-by: Ephraim Park Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 487f05e18aa4 ("perf/core: Optimize event rescheduling on active contexts") Link: http://lkml.kernel.org/r/20180306055504.3283731-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 96db9ae5d5af..4b838470fac4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2246,7 +2246,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type = event_type & EVENT_ALL; + enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2256,6 +2256,8 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; + ctx_event_type = event_type & EVENT_ALL; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) task_ctx_sched_out(cpuctx, task_ctx, event_type); -- cgit v1.2.3-71-gd317 From 6d8cb045cde681e64a5ed80a2ab70be831a7f9b0 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 8 Mar 2018 23:46:33 -0800 Subject: bpf: comment why dots in filenames under BPF virtual FS are not allowed When pinning a file under the BPF virtual file system (traditionally /sys/fs/bpf), using a dot in the name of the location to pin at is not allowed. For example, trying to pin at "/sys/fs/bpf/foo.bar" will be rejected with -EPERM. This check was introduced at the same time as the BPF file system itself, with commit b2197755b263 ("bpf: add support for persistent maps/progs"). At this time, it was checked in a function called "bpf_dname_reserved()", which made clear that using a dot was reserved for future extensions. This function disappeared and the check was moved elsewhere with commit 0c93b7d85d40 ("bpf: reject invalid names right in ->lookup()"), and the meaning of the dot ban was lost. The present commit simply adds a comment in the source to explain to the reader that the usage of dots is reserved for future usage. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81e2f6995adb..bf6da59ae0d0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. + */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); -- cgit v1.2.3-71-gd317 From 6b0ef92fee2a3189eba6d6b827b247cb4f6da7e9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 9 Mar 2018 14:56:28 +0800 Subject: rtmutex: Make rt_mutex_futex_unlock() safe for irq-off callsites When running rcutorture with TREE03 config, CONFIG_PROVE_LOCKING=y, and kernel cmdline argument "rcutorture.gp_exp=1", lockdep reports a HARDIRQ-safe->HARDIRQ-unsafe deadlock: ================================ WARNING: inconsistent lock state 4.16.0-rc4+ #1 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. 
takes: __schedule+0xbe/0xaf0 {IN-HARDIRQ-W} state was registered at: _raw_spin_lock+0x2a/0x40 scheduler_tick+0x47/0xf0 ... other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&rq->lock); lock(&rq->lock); *** DEADLOCK *** 1 lock held by rcu_torture_rea/724: rcu_torture_read_lock+0x0/0x70 stack backtrace: CPU: 2 PID: 724 Comm: rcu_torture_rea Not tainted 4.16.0-rc4+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014 Call Trace: lock_acquire+0x90/0x200 ? __schedule+0xbe/0xaf0 _raw_spin_lock+0x2a/0x40 ? __schedule+0xbe/0xaf0 __schedule+0xbe/0xaf0 preempt_schedule_irq+0x2f/0x60 retint_kernel+0x1b/0x2d RIP: 0010:rcu_read_unlock_special+0x0/0x680 ? rcu_torture_read_unlock+0x60/0x60 __rcu_read_unlock+0x64/0x70 rcu_torture_read_unlock+0x17/0x60 rcu_torture_reader+0x275/0x450 ? rcutorture_booster_init+0x110/0x110 ? rcu_torture_stall+0x230/0x230 ? kthread+0x10e/0x130 kthread+0x10e/0x130 ? kthread_create_worker_on_cpu+0x70/0x70 ? call_usermodehelper_exec_async+0x11a/0x150 ret_from_fork+0x3a/0x50 This happens with the following even sequence: preempt_schedule_irq(); local_irq_enable(); __schedule(): local_irq_disable(); // irq off ... rcu_note_context_switch(): rcu_note_preempt_context_switch(): rcu_read_unlock_special(): local_irq_save(flags); ... raw_spin_unlock_irqrestore(...,flags); // irq remains off rt_mutex_futex_unlock(): raw_spin_lock_irq(); ... raw_spin_unlock_irq(); // accidentally set irq on rq_lock(): raw_spin_lock(); // acquiring rq->lock with irq on which means rq->lock becomes a HARDIRQ-unsafe lock, which can cause deadlocks in scheduler code. This problem was introduced by commit 02a7c234e540 ("rcu: Suppress lockdep false-positive ->boost_mtx complaints"). That brought the user of rt_mutex_futex_unlock() with irq off. To fix this, replace the *lock_irq() in rt_mutex_futex_unlock() with *lock_irq{save,restore}() to make it safe to call rt_mutex_futex_unlock() with irq off. Fixes: 02a7c234e540 ("rcu: Suppress lockdep false-positive ->boost_mtx complaints") Signed-off-by: Boqun Feng Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Josh Triplett Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Link: https://lkml.kernel.org/r/20180309065630.8283-1-boqun.feng@gmail.com --- kernel/locking/rtmutex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 65cc0cb984e6..940633c63254 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1616,11 +1616,12 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) { DEFINE_WAKE_Q(wake_q); + unsigned long flags; bool postunlock; - raw_spin_lock_irq(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irq(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (postunlock) rt_mutex_postunlock(&wake_q); -- cgit v1.2.3-71-gd317 From 34a866bd45d69754da1979e83a37bec6defc6295 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sun, 4 Mar 2018 13:10:16 +0100 Subject: genirq/irq_sim: Explicitly include slab.h kfree() is used in the irq_sim code but slab.h is pulled in indirectly via irq.h. Include it explicitly. 
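[ A hedged illustration of the hazard being fixed; example_teardown() is made up. If linux/irq.h ever stopped pulling in slab.h transitively, any file relying on the indirect include would stop building:

	#include <linux/irq.h>	/* happens to drag in slab.h today */
	#include <linux/slab.h>	/* what the patch adds: state the dependency */

	static void example_teardown(void *obj)
	{
		kfree(obj);	/* resolves by design, not by header luck */
	}
]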
Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20180304121018.640-2-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 24caabf1a0f7..bd7dc1db6a80 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -7,6 +7,7 @@ * option) any later version. */ +#include #include #include -- cgit v1.2.3-71-gd317 From 28b6afa7d4456e759031bf83706b4be3689fba94 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sun, 4 Mar 2018 13:10:17 +0100 Subject: genirq/irq_sim: Check the return value of irq_sim_init() for error codes As discussed with Marc Zyngier: irq_sim_init() and its devres variant should return the base of the allocated interrupt range on success rather than 0. Make devm_irq_sim_init() check for an error code. This is a preparatory change for modifying irq_sim_init() itself. Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20180304121018.640-3-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index bd7dc1db6a80..05f0d7c1698a 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -124,7 +124,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, return -ENOMEM; rv = irq_sim_init(sim, num_irqs); - if (rv) { + if (rv < 0) { devres_free(dr); return rv; } -- cgit v1.2.3-71-gd317 From f09777fa891327ee0dd892e83619f156bf1c37e6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sun, 4 Mar 2018 13:10:18 +0100 Subject: genirq/irq_sim: Return the base of the irq range from irq_sim_init() Returning the base of the allocated interrupt range from irq_sim_init() and devm_irq_sim_init() allows users to handle the logic of associating irq numbers with any other driver-specific resources without having to use irq_sim_irqnum(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20180304121018.640-4-brgl@bgdev.pl --- kernel/irq/irq_sim.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 05f0d7c1698a..85690859a2a8 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -50,7 +50,8 @@ static void irq_sim_handle_irq(struct irq_work *work) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. */ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) { @@ -79,7 +80,7 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); sim->irq_count = num_irqs; - return 0; + return sim->irq_base; } EXPORT_SYMBOL_GPL(irq_sim_init); @@ -111,7 +112,8 @@ static void devm_irq_sim_release(struct device *dev, void *res) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. 
*/ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, unsigned int num_irqs) @@ -132,7 +134,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, dr->sim = sim; devres_add(dev, dr); - return 0; + return rv; } EXPORT_SYMBOL_GPL(devm_irq_sim_init); -- cgit v1.2.3-71-gd317 From 6498ddad301c7a94162915d06d1efe2e5d20f6dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Feb 2018 17:48:07 +0100 Subject: softirq: Consolidate common code in __tasklet_[hi]_schedule() __tasklet_schedule() and __tasklet_hi_schedule() are almost identical. Move the common code from both function into __tasklet_schedule_common() and let both functions invoke it with different arguments. [ bigeasy: Splitted out from RT's "tasklet: Prevent tasklets from going into infinite spin in RT" and added commit message. Use this_cpu_ptr(headp) in __tasklet_schedule_common() as suggested by Julia Cartwright ] Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Julia Cartwright Link: https://lkml.kernel.org/r/20180227164808.10093-2-bigeasy@linutronix.de --- kernel/softirq.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 24d243ef8e71..2394b009994f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -460,29 +460,33 @@ struct tasklet_head { static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); -void __tasklet_schedule(struct tasklet_struct *t) +static void __tasklet_schedule_common(struct tasklet_struct *t, + struct tasklet_head __percpu *headp, + unsigned int softirq_nr) { + struct tasklet_head *head; unsigned long flags; local_irq_save(flags); + head = this_cpu_ptr(headp); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(softirq_nr); local_irq_restore(flags); } + +void __tasklet_schedule(struct tasklet_struct *t) +{ + __tasklet_schedule_common(t, &tasklet_vec, + TASKLET_SOFTIRQ); +} EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) { - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); + __tasklet_schedule_common(t, &tasklet_hi_vec, + HI_SOFTIRQ); } EXPORT_SYMBOL(__tasklet_hi_schedule); -- cgit v1.2.3-71-gd317 From 82b691bedf05f258f1c86c96ee574b0d7795c0a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Feb 2018 17:48:08 +0100 Subject: softirq: Consolidate common code in tasklet_[hi]_action() tasklet_action() + tasklet_hi_action() are almost identical. Move the common code from both function into __tasklet_action_common() and let both functions invoke it with different arguments. 
[ bigeasy: Splitted out from RT's "tasklet: Prevent tasklets from going into infinite spin in RT" and added commit message] Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Julia Cartwright Link: https://lkml.kernel.org/r/20180227164808.10093-3-bigeasy@linutronix.de --- kernel/softirq.c | 54 +++++++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2394b009994f..177de3640c78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -490,14 +490,16 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -static __latent_entropy void tasklet_action(struct softirq_action *a) +static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) { struct tasklet_struct *list; local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); + list = tl_head->head; + tl_head->head = NULL; + tl_head->tail = &tl_head->head; local_irq_enable(); while (list) { @@ -519,47 +521,21 @@ static __latent_entropy void tasklet_action(struct softirq_action *a) local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); + *tl_head->tail = t; + tl_head->tail = &t->next; + __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { - struct tasklet_struct *list; - - local_irq_disable(); - list = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); +} - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +{ + tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_init(struct tasklet_struct *t, -- cgit v1.2.3-71-gd317 From af40ff687bc9d351030685fde2f57ba45ab4fc14 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 8 Mar 2018 17:41:05 +0000 Subject: arm64: signal: Ensure si_code is valid for all fault signals Currently, as reported by Eric, an invalid si_code value 0 is passed in many signals delivered to userspace in response to faults and other kernel errors. Typically 0 is passed when the fault is insufficiently diagnosable or when there does not appear to be any sensible alternative value to choose. 
This appears to violate POSIX, and is intuitively wrong for at least two reasons arising from the fact that 0 == SI_USER: 1) si_code is a union selector, and SI_USER (and si_code <= 0 in general) implies the existence of a different set of fields (siginfo._kill) from that which exists for a fault signal (siginfo._sigfault). However, the code raising the signal typically writes only the _sigfault fields, and the _kill fields make no sense in this case. Thus when userspace sees si_code == 0 (SI_USER) it may legitimately inspect fields in the inactive union member _kill and obtain garbage as a result. There appears to be software in the wild relying on this, albeit generally only for printing diagnostic messages. 2) Software that wants to be robust against spurious signals may discard signals where si_code == SI_USER (or <= 0), or may filter such signals based on the si_uid and si_pid fields of siginfo._sigkill. In the case of fault signals, this means that important (and usually fatal) error conditions may be silently ignored. In practice, many of the faults for which arm64 passes si_code == 0 are undiagnosable conditions such as exceptions with syndrome values in ESR_ELx to which the architecture does not yet assign any meaning, or conditions indicative of a bug or error in the kernel or system and thus that are unrecoverable and should never occur in normal operation. The approach taken in this patch is to translate all such undiagnosable or "impossible" synchronous fault conditions to SIGKILL, since these are at least probably localisable to a single process. Some of these conditions should really result in a kernel panic, but due to the lack of diagnostic information it is difficult to be certain: this patch does not add any calls to panic(), but this could change later if justified. Although si_code will not reach userspace in the case of SIGKILL, it is still desirable to pass a nonzero value so that the common siginfo handling code can detect incorrect use of si_code == 0 without false positives. In this case the si_code dependent siginfo fields will not be correctly initialised, but since they are not passed to userspace I deem this not to matter. A few faults can reasonably occur in realistic userspace scenarios, and _should_ raise a regular, handleable (but perhaps not ignorable/blockable) signal: for these, this patch attempts to choose a suitable standard si_code value for the raised signal in each case instead of 0. arm64 was the only arch to define a BUS_FIXME code, so after this patch nobody defines it. This patch therefore also removes the relevant code from siginfo_layout(). Cc: James Morse Reported-by: Eric W. 
Biederman Signed-off-by: Dave Martin Signed-off-by: Will Deacon --- arch/arm64/include/uapi/asm/siginfo.h | 14 ---- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/fault.c | 116 +++++++++++++++++----------------- kernel/signal.c | 4 -- 4 files changed, 59 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/uapi/asm/siginfo.h b/arch/arm64/include/uapi/asm/siginfo.h index 9b4d91277742..8d7dbbcce780 100644 --- a/arch/arm64/include/uapi/asm/siginfo.h +++ b/arch/arm64/include/uapi/asm/siginfo.h @@ -28,18 +28,4 @@ #define FPE_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ -/* - * SIGBUS si_codes - */ -#ifdef __KERNEL__ -#define BUS_FIXME 0 /* Broken dup of SI_USER */ -#endif /* __KERNEL__ */ - -/* - * SIGTRAP si_codes - */ -#ifdef __KERNEL__ -#define TRAP_FIXME 0 /* Broken dup of SI_USER */ -#endif /* __KERNEL__ */ - #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 6964ff867d4a..65fc87645ec6 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -285,7 +285,7 @@ static void task_fpsimd_save(void) * re-enter user with corrupt state. * There's no way to recover, so kill it: */ - force_signal_inject(SIGKILL, 0, 0); + force_signal_inject(SIGKILL, SI_KERNEL, 0); return; } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 49dfb08a6c4d..551d044fb31f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -583,9 +583,9 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) nmi_exit(); } - info.si_signo = SIGBUS; + info.si_signo = inf->sig; info.si_errno = 0; - info.si_code = BUS_FIXME; + info.si_code = inf->code; if (esr & ESR_ELx_FnV) info.si_addr = NULL; else @@ -596,70 +596,70 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) } static const struct fault_info fault_info[] = { - { do_bad, SIGBUS, BUS_FIXME, "ttbr address size fault" }, - { do_bad, SIGBUS, BUS_FIXME, "level 1 address size fault" }, - { do_bad, SIGBUS, BUS_FIXME, "level 2 address size fault" }, - { do_bad, SIGBUS, BUS_FIXME, "level 3 address size fault" }, + { do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" }, + { do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" }, + { do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" }, + { do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 8" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 12" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 12" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, - { do_sea, SIGBUS, BUS_FIXME, "synchronous external abort" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 17" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 18" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 19" }, - { do_sea, SIGBUS, BUS_FIXME, "level 0 (translation table 
walk)" }, - { do_sea, SIGBUS, BUS_FIXME, "level 1 (translation table walk)" }, - { do_sea, SIGBUS, BUS_FIXME, "level 2 (translation table walk)" }, - { do_sea, SIGBUS, BUS_FIXME, "level 3 (translation table walk)" }, - { do_sea, SIGBUS, BUS_FIXME, "synchronous parity or ECC error" }, // Reserved when RAS is implemented - { do_bad, SIGBUS, BUS_FIXME, "unknown 25" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 26" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 27" }, - { do_sea, SIGBUS, BUS_FIXME, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, BUS_FIXME, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, BUS_FIXME, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, BUS_FIXME, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_bad, SIGBUS, BUS_FIXME, "unknown 32" }, + { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 17" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, + { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, + { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, + { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, + { do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" }, + { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented + { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 27" }, + { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_bad, SIGKILL, SI_KERNEL, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 34" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 35" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 36" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 37" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 38" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 39" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 40" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 41" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 42" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 43" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 44" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 45" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 46" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 47" }, - { do_bad, SIGBUS, BUS_FIXME, "TLB conflict abort" }, - { do_bad, SIGBUS, BUS_FIXME, "Unsupported atomic hardware update fault" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 50" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 51" }, - { do_bad, SIGBUS, BUS_FIXME, "implementation fault (lockdown abort)" }, - { do_bad, SIGBUS, BUS_FIXME, "implementation fault (unsupported exclusive)" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 54" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 55" }, - { do_bad, SIGBUS, 
BUS_FIXME, "unknown 56" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 57" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 58" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 59" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 60" }, - { do_bad, SIGBUS, BUS_FIXME, "section domain fault" }, - { do_bad, SIGBUS, BUS_FIXME, "page domain fault" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 63" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 34" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 35" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 36" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 37" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 44" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 45" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 46" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 47" }, + { do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" }, + { do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 50" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 51" }, + { do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" }, + { do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 54" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 55" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 56" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 57" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 58" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 59" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 60" }, + { do_bad, SIGKILL, SI_KERNEL, "section domain fault" }, + { do_bad, SIGKILL, SI_KERNEL, "page domain fault" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; int handle_guest_sea(phys_addr_t addr, unsigned int esr) @@ -748,11 +748,11 @@ static struct fault_info __refdata debug_fault_info[] = { { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 3" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 3" }, { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" }, - { do_bad, SIGTRAP, TRAP_FIXME, "aarch32 vector catch" }, + { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" }, { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, - { do_bad, SIGBUS, BUS_FIXME, "unknown 7" }, + { do_bad, SIGKILL, SI_KERNEL, "unknown 7" }, }; void __init hook_debug_fault_code(int nr, diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..049a482e705c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2843,10 +2843,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #ifdef FPE_FIXME if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; -#endif -#ifdef BUS_FIXME - if ((sig == SIGBUS) && (si_code == BUS_FIXME)) - layout = SIL_FAULT; #endif } return layout; -- cgit v1.2.3-71-gd317 From 0862ca422b79cb5aa70823ee0f07f6b468f86070 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 9 Mar 2018 15:50:59 -0800 Subject: bug: use %pB in BUG and stack protector failure The BUG and stack protector reports were still using a raw %p. This changes it to %pB for more meaningful output. 
Link: http://lkml.kernel.org/r/20180301225704.GA34198@beast Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Richard Weinberger , Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- lib/bug.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..4b794f1d8561 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -640,7 +640,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %p\n", + panic("stack-protector: Kernel stack is corrupted in: %pB\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/lib/bug.c b/lib/bug.c index c1b0fad31b10..44f432cb064d 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -191,7 +191,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); else - pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n", + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", (void *)bugaddr); return BUG_TRAP_TYPE_BUG; -- cgit v1.2.3-71-gd317 From c2cda2a5bda9f1369c9d1ab54a20571c13cf2743 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 9 Mar 2018 10:42:47 -0800 Subject: timekeeping/ntp: Don't align NTP frequency adjustments to ticks When the timekeeping multiplier is changed, the NTP error is updated to correct the clock for the delay between the tick and the update of the clock. This error is corrected in later updates and the clock appears as if the frequency was changed exactly on the tick. Remove this correction to keep the point where the frequency is effectively changed at the time of the update. This removes a major source of the NTP error. Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1520620971-9567-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd03317e7b57..c1a0ac17336e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1860,8 +1860,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplfies to: * xtime_nsec -= offset - * - * XXX - TODO: Doc ntp_error calculation. */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ @@ -1872,7 +1870,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; - tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } /* -- cgit v1.2.3-71-gd317 From 78b98e3c5a66d569a53b8f57b6a698f912794a43 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 9 Mar 2018 10:42:48 -0800 Subject: timekeeping/ntp: Determine the multiplier directly from NTP tick length When the length of the NTP tick changes significantly, e.g. 
when an NTP/PTP application is correcting the initial offset of the clock, a large value may accumulate in the NTP error before the multiplier converges to the correct value. It may then take a very long time (hours or even days) before the error is corrected. This causes the clock to have an unstable frequency offset, which has a negative impact on the stability of synchronization with precise time sources (e.g. NTP/PTP using hardware timestamping or the PTP KVM clock). Use division to determine the correct multiplier directly from the NTP tick length and replace the iterative approach. This removes the last major source of the NTP error. The only remaining source is now limited resolution of the multiplier, which is corrected by adding 1 to the multiplier when the system clock is behind the NTP time. Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1520620971-9567-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/timekeeper_internal.h | 2 + kernel/time/timekeeping.c | 138 ++++++++++++------------------------ 2 files changed, 49 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index d315c3d6725c..7acb953298a7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -117,6 +117,8 @@ struct timekeeper { s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; + /* Flag used to avoid updating NTP twice with same second */ + u32 skip_second_overflow; #ifdef CONFIG_DEBUG_TIMEKEEPING long last_warning; /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c1a0ac17336e..e11760121cb2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -332,6 +332,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; + tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ @@ -1799,20 +1800,19 @@ device_initcall(timekeeping_init_ops); */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, - bool negative, - int adj_scale) + s32 mult_adj) { s64 interval = tk->cycle_interval; - s32 mult_adj = 1; - if (negative) { - mult_adj = -mult_adj; + if (mult_adj == 0) { + return; + } else if (mult_adj == -1) { interval = -interval; - offset = -offset; + offset = -offset; + } else if (mult_adj != 1) { + interval *= mult_adj; + offset *= mult_adj; } - mult_adj <<= adj_scale; - interval <<= adj_scale; - offset <<= adj_scale; /* * So the following can be confusing. @@ -1873,85 +1873,35 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, } /* - * Calculate the multiplier adjustment needed to match the frequency - * specified by NTP + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
*/ -static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, - s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 interval = tk->cycle_interval; - s64 xinterval = tk->xtime_interval; - u32 base = tk->tkr_mono.clock->mult; - u32 max = tk->tkr_mono.clock->maxadj; - u32 cur_adj = tk->tkr_mono.mult; - s64 tick_error; - bool negative; - u32 adj_scale; - - /* Remove any current error adj from freq calculation */ - if (tk->ntp_err_mult) - xinterval -= tk->cycle_interval; - - tk->ntp_tick = ntp_tick_length(); - - /* Calculate current error per tick */ - tick_error = ntp_tick_length() >> tk->ntp_error_shift; - tick_error -= (xinterval + tk->xtime_remainder); - - /* Don't worry about correcting it if its small */ - if (likely((tick_error >= 0) && (tick_error <= interval))) - return; - - /* preserve the direction of correction */ - negative = (tick_error < 0); + u32 mult; - /* If any adjustment would pass the max, just return */ - if (negative && (cur_adj - 1) <= (base - max)) - return; - if (!negative && (cur_adj + 1) >= (base + max)) - return; /* - * Sort out the magnitude of the correction, but - * avoid making so large a correction that we go - * over the max adjustment. + * Determine the multiplier from the current NTP tick length. + * Avoid expensive division when the tick length doesn't change. */ - adj_scale = 0; - tick_error = abs(tick_error); - while (tick_error > interval) { - u32 adj = 1 << (adj_scale + 1); - - /* Check if adjustment gets us within 1 unit from the max */ - if (negative && (cur_adj - adj) <= (base - max)) - break; - if (!negative && (cur_adj + adj) >= (base + max)) - break; - - adj_scale++; - tick_error >>= 1; + if (likely(tk->ntp_tick == ntp_tick_length())) { + mult = tk->tkr_mono.mult - tk->ntp_err_mult; + } else { + tk->ntp_tick = ntp_tick_length(); + mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - + tk->xtime_remainder, tk->cycle_interval); } - /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj_scale); -} + /* + * If the clock is behind the NTP time, increase the multiplier by 1 + * to catch up with it. If it's ahead and there was a remainder in the + * tick division, the clock will slow down. Otherwise it will stay + * ahead until the tick length changes to a non-divisible value. + */ + tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; + mult += tk->ntp_err_mult; -/* - * Adjust the timekeeper's multiplier to the correct frequency - * and also to reduce the accumulated error value. - */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - /* Correct for the current frequency error */ - timekeeping_freqadjust(tk, offset); - - /* Next make a small adjustment to fix any cumulative error */ - if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { - tk->ntp_err_mult = 1; - timekeeping_apply_adjustment(tk, offset, 0, 0); - } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { - /* Undo any existing error adjustment */ - timekeeping_apply_adjustment(tk, offset, 1, 0); - tk->ntp_err_mult = 0; - } + timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) @@ -1968,18 +1918,15 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. 
* - * Now, since we already accumulated the second, cannot simply roll - * the accumulated second back, since the NTP subsystem has been - * notified via second_overflow. So instead we push xtime_nsec forward - * by the amount we underflowed, and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. + * Now, since we have already accumulated the second and the NTP + * subsystem has been notified via second_overflow(), we need to skip + * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr_mono.xtime_nsec; - tk->tkr_mono.xtime_nsec = 0; - tk->ntp_error += neg << tk->ntp_error_shift; + tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << + tk->tkr_mono.shift; + tk->xtime_sec--; + tk->skip_second_overflow = 1; } } @@ -2002,6 +1949,15 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; + /* + * Skip NTP update if this second was accumulated before, + * i.e. xtime_nsec underflowed in timekeeping_adjust() + */ + if (unlikely(tk->skip_second_overflow)) { + tk->skip_second_overflow = 0; + continue; + } + /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { @@ -2118,7 +2074,7 @@ void update_wall_time(void) shift--; } - /* correct the clock when NTP error is too big */ + /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* -- cgit v1.2.3-71-gd317 From cbf4100efb8f279b6f35917b748b2239019c7a96 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Mon, 15 Jan 2018 20:51:37 -0600 Subject: tracing: Add support to detect and avoid duplicates A duplicate in the tracing_map hash table is when 2 different entries have the same key and, as a result, the key_hash. This is possible due to a race condition in the algorithm. This race condition is inherent to the algorithm and not a bug. This was fine because, until now, we were only interested in the sum of all the values related to a particular key (the duplicates are dealt with in tracing_map_sort_entries()). But, with the inclusion of variables[1], we are interested in individual values. So, it will not be clear what value to choose when there are duplicates. So, the duplicates need to be removed. The duplicates can occur in the code in the following scenarios: - A thread is in the process of adding a new element. It has successfully executed cmpxchg() and inserted the key. But, it is still not done acquiring the trace_map_elt struct, populating it and storing the pointer to the struct in the value field of tracing_map hash table. If another thread comes in at this time and wants to add an element with the same key, it will not see the current element and add a new one. - There are multiple threads trying to execute cmpxchg at the same time, one of the threads will succeed and the others will fail. The ones which fail will go ahead increment 'idx' and add a new element there creating a duplicate. This patch detects and avoids the first condition by asking the thread which detects the duplicate to loop one more time. There is also a possibility of infinite loop if the thread which is trying to insert goes to sleep indefinitely and the one which is trying to insert a new element detects a duplicate. Which is why, the thread loops for map_size iterations before returning NULL. The second scenario is avoided by preventing the threads which failed cmpxchg() from incrementing idx. 
This way, they will loop around and check if the thread which succeeded in executing cmpxchg() had the same key. [1] http://lkml.kernel.org/r/cover.1498510759.git.tom.zanussi@linux.intel.com Link: http://lkml.kernel.org/r/e178e89ec399240331d383bd5913d649713110f4.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Vedang Patel Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 07e75344725b..b30f3439f27f 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -414,7 +414,9 @@ static inline struct tracing_map_elt * __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) { u32 idx, key_hash, test_key; + int dup_try = 0; struct tracing_map_entry *entry; + struct tracing_map_elt *val; key_hash = jhash(key, map->key_size, 0); if (key_hash == 0) @@ -426,11 +428,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) entry = TRACING_MAP_ENTRY(map->map, idx); test_key = entry->key; - if (test_key && test_key == key_hash && entry->val && - keys_match(key, entry->val->key, map->key_size)) { - if (!lookup_only) - atomic64_inc(&map->hits); - return entry->val; + if (test_key && test_key == key_hash) { + val = READ_ONCE(entry->val); + if (val && + keys_match(key, val->key, map->key_size)) { + if (!lookup_only) + atomic64_inc(&map->hits); + return val; + } else if (unlikely(!val)) { + /* + * The key is present. But, val (pointer to elt + * struct) is still NULL. which means some other + * thread is in the process of inserting an + * element. + * + * On top of that, it's key_hash is same as the + * one being inserted right now. So, it's + * possible that the element has the same + * key as well. + */ + + dup_try++; + if (dup_try > map->map_size) { + atomic64_inc(&map->drops); + break; + } + continue; + } } if (!test_key) { @@ -452,6 +476,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) atomic64_inc(&map->hits); return entry->val; + } else { + /* + * cmpxchg() failed. Loop around once + * more to check what key was inserted. + */ + dup_try++; + continue; } } -- cgit v1.2.3-71-gd317 From c193707dde77ace92a649cd59a17e105e2fbeaef Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Mon, 15 Jan 2018 20:51:38 -0600 Subject: tracing: Remove code which merges duplicates We now have the logic to detect and remove duplicates in the tracing_map hash table. The code which merges duplicates in the histogram is redundant now. So, modify this code just to detect duplicates. The duplication detection code is still kept to ensure that any rare race condition which might cause duplicates does not go unnoticed. 
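[ A self-contained userspace sketch of the detection-only policy, assuming the entries were already sorted so equal keys sit next to each other; the names and key types are illustrative, not the kernel's:

	#include <stdio.h>
	#include <string.h>

	/*
	 * One linear pass over sorted keys: count duplicates and warn,
	 * mirroring the reduced detect_dups() which no longer merges.
	 */
	static void detect_dups(const char **keys, int n_entries)
	{
		int total_dups = 0;
		int i;

		for (i = 1; i < n_entries; i++)
			if (strcmp(keys[i], keys[i - 1]) == 0)
				total_dups++;

		if (total_dups > 0)
			fprintf(stderr, "Duplicates detected: %d\n", total_dups);
	}

	int main(void)
	{
		const char *keys[] = { "alpha", "beta", "beta", "gamma" };

		detect_dups(keys, 4);	/* prints: Duplicates detected: 1 */
		return 0;
	}
]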
Link: http://lkml.kernel.org/r/55215cf59e2674391bdaf772fdafc4c393352b03.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Vedang Patel Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 11 ------ kernel/trace/tracing_map.c | 83 +++------------------------------------- kernel/trace/tracing_map.h | 7 ---- 3 files changed, 6 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1e1558c99d56..712260e72be5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -340,16 +340,6 @@ static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) return 0; } -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, - struct tracing_map_elt *from) -{ - char *comm_from = from->private_data; - char *comm_to = to->private_data; - - if (comm_from) - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); -} - static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) { char *comm = elt->private_data; @@ -360,7 +350,6 @@ static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) static const struct tracing_map_ops hist_trigger_elt_comm_ops = { .elt_alloc = hist_trigger_elt_comm_alloc, - .elt_copy = hist_trigger_elt_comm_copy, .elt_free = hist_trigger_elt_comm_free, .elt_init = hist_trigger_elt_comm_init, }; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index b30f3439f27f..f47a4d54bcf0 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -847,67 +847,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) return sort_entry; } -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) -{ - struct tracing_map_elt *dup_elt; - unsigned int i; - - dup_elt = tracing_map_elt_alloc(elt->map); - if (IS_ERR(dup_elt)) - return NULL; - - if (elt->map->ops && elt->map->ops->elt_copy) - elt->map->ops->elt_copy(dup_elt, elt); - - dup_elt->private_data = elt->private_data; - memcpy(dup_elt->key, elt->key, elt->map->key_size); - - for (i = 0; i < elt->map->n_fields; i++) { - atomic64_set(&dup_elt->fields[i].sum, - atomic64_read(&elt->fields[i].sum)); - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; - } - - return dup_elt; -} - -static int merge_dup(struct tracing_map_sort_entry **sort_entries, - unsigned int target, unsigned int dup) -{ - struct tracing_map_elt *target_elt, *elt; - bool first_dup = (target - dup) == 1; - int i; - - if (first_dup) { - elt = sort_entries[target]->elt; - target_elt = copy_elt(elt); - if (!target_elt) - return -ENOMEM; - sort_entries[target]->elt = target_elt; - sort_entries[target]->elt_copied = true; - } else - target_elt = sort_entries[target]->elt; - - elt = sort_entries[dup]->elt; - - for (i = 0; i < elt->map->n_fields; i++) - atomic64_add(atomic64_read(&elt->fields[i].sum), - &target_elt->fields[i].sum); - - sort_entries[dup]->dup = true; - - return 0; -} - -static int merge_dups(struct tracing_map_sort_entry **sort_entries, +static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { unsigned int dups = 0, total_dups = 0; - int err, i, j; + int i; void *key; if (n_entries < 2) - return total_dups; + return; sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), (int (*)(const void *, const void *))cmp_entries_dup, NULL); @@ -916,30 +864,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries, for (i = 1; i < n_entries; i++) { if 
(!memcmp(sort_entries[i]->key, key, key_size)) { dups++; total_dups++; - err = merge_dup(sort_entries, i - dups, i); - if (err) - return err; continue; } key = sort_entries[i]->key; dups = 0; } - if (!total_dups) - return total_dups; - - for (i = 0, j = 0; i < n_entries; i++) { - if (!sort_entries[i]->dup) { - sort_entries[j] = sort_entries[i]; - if (j++ != i) - sort_entries[i] = NULL; - } else { - destroy_sort_entry(sort_entries[i]); - sort_entries[i] = NULL; - } - } - - return total_dups; + WARN_ONCE(total_dups > 0, + "Duplicates detected: %d\n", total_dups); } static bool is_key(struct tracing_map *map, unsigned int field_idx) @@ -1065,10 +997,7 @@ int tracing_map_sort_entries(struct tracing_map *map, return 1; } - ret = merge_dups(entries, n_entries, map->key_size); - if (ret < 0) - goto free; - n_entries -= ret; + detect_dups(entries, n_entries, map->key_size); if (is_key(map, sort_keys[0].field_idx)) cmp_entries_fn = cmp_entries_key; diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 5b5bbf8ae550..de57887c0670 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -215,11 +215,6 @@ struct tracing_map { * Element allocation occurs before tracing begins, when the * tracing_map_init() call is made by client code. * - * @elt_copy: At certain points in the lifetime of an element, it may - * need to be copied. The copy should include a copy of the - * client-allocated data, which can be copied into the 'to' - * element from the 'from' element. - * * @elt_free: When a tracing_map_elt is freed, this function is called * and allows client-allocated per-element data to be freed. * @@ -233,8 +228,6 @@ struct tracing_map { */ struct tracing_map_ops { int (*elt_alloc)(struct tracing_map_elt *elt); - void (*elt_copy)(struct tracing_map_elt *to, - struct tracing_map_elt *from); void (*elt_free)(struct tracing_map_elt *elt); void (*elt_clear)(struct tracing_map_elt *elt); void (*elt_init)(struct tracing_map_elt *elt); -- cgit v1.2.3-71-gd317 From 00b4145298aeb05a2d110117ed18148cb21ebd14 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:39 -0600 Subject: ring-buffer: Add interface for setting absolute time stamps Define a new function, tracing_set_time_stamp_abs(), which can be used to enable or disable the use of absolute timestamps rather than time deltas for a trace array. Only the interface is added here; a subsequent patch will add the underlying implementation. 
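For context, a hedged sketch of how a client inside the kernel might pair the calls (the my_setup()/my_teardown() names are hypothetical; only tracing_set_time_stamp_abs() comes from this patch):

	static int my_setup(struct trace_array *tr)
	{
		/* Request absolute timestamps; returns 0 or a negative errno. */
		return tracing_set_time_stamp_abs(tr, true);
	}

	static void my_teardown(struct trace_array *tr)
	{
		/*
		 * Drop the request; as implemented below, the calls are
		 * reference counted, so the mode stays enabled while any
		 * other user still needs it.
		 */
		tracing_set_time_stamp_abs(tr, false);
	}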
Link: http://lkml.kernel.org/r/ce96119de44c7fe0ee44786d15254e9b493040d3.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Baohong Liu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.c | 33 ++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 3 +++ 4 files changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 7d9eb39fa76a..025159e17e1b 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -178,6 +178,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, int cpu, u64 *ts); void ring_buffer_set_clock(struct ring_buffer *buffer, u64 (*clock)(void)); +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs); +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer); size_t ring_buffer_page_len(void *page); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dcf1c4dd3efe..2a03e069bbc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -488,6 +488,7 @@ struct ring_buffer { u64 (*clock)(void); struct rb_irq_work irq_work; + bool time_stamp_abs; }; struct ring_buffer_iter { @@ -1382,6 +1383,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, buffer->clock = clock; } +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs) +{ + buffer->time_stamp_abs = abs; +} + +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer) +{ + return buffer->time_stamp_abs; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static inline unsigned long rb_page_entries(struct buffer_page *bpage) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20a2300ae4e8..cba003f0362e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2269,7 +2269,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, *current_rb = trace_file->tr->trace_buffer.buffer; - if ((trace_file->flags & + if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ @@ -6282,6 +6282,37 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return ret; } +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (abs && tr->time_stamp_abs_ref++) + goto out; + + if (!abs) { + if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + ret = -EINVAL; + goto out; + } + + if (--tr->time_stamp_abs_ref) + goto out; + } + + ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) + ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); +#endif + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + struct ftrace_buffer_info { struct trace_iterator iter; void *spare; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2a6d0325a761..477341710ebf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -273,6 +273,7 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif + int time_stamp_abs_ref; }; enum { @@ -286,6 +287,8 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); +extern int tracing_set_time_stamp_abs(struct 
trace_array *tr, bool abs); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. -- cgit v1.2.3-71-gd317 From dc4e2801d400b0346fb281ce9cf010d611e2243c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:40 -0600 Subject: ring-buffer: Redefine the unimplemented RINGBUF_TYPE_TIME_STAMP RINGBUF_TYPE_TIME_STAMP is defined but not used, and from what I can gather was reserved for something like an absolute timestamp feature for the ring buffer, if not a complete replacement of the current time_delta scheme. This code redefines RINGBUF_TYPE_TIME_STAMP to implement absolute time stamps. Another way to look at it is that it essentially forces extended time_deltas for all events. The motivation for doing this is to enable time_deltas that aren't dependent on previous events in the ring buffer, making it feasible to use the ring_buffer_event timestamps in a more random-access way, for purposes other than serial event printing. To set/reset this mode, use tracing_set_time_stamp_abs() from the previous interface patch. Link: http://lkml.kernel.org/r/477b362dba1ce7fab9889a1a8e885a62c472f041.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 12 ++--- kernel/trace/ring_buffer.c | 104 ++++++++++++++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 025159e17e1b..7cb84774c20d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -34,10 +34,12 @@ struct ring_buffer_event { * array[0] = time delta (28 .. 59) * size = 8 bytes * - * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock - * array[0] = tv_nsec - * array[1..2] = tv_sec - * size = 16 bytes + * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp + * Same format as TIME_EXTEND except that the + * value is an absolute timestamp, not a delta + * event.time_delta contains bottom 27 bits + * array[0] = top (28 ..

59) bits + * size = 8 bytes * * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: * Data record @@ -54,12 +56,12 @@ enum ring_buffer_type { RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, - /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ RINGBUF_TYPE_TIME_STAMP, }; unsigned ring_buffer_event_length(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event); +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event); /* * ring_buffer_discard_commit will remove an event that has not diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2a03e069bbc6..33073cdebb26 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\ttime_stamp : type == %d\n", + RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); @@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s) enum { RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, + RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) +#define extended_time(event) \ + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); @@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); @@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static __always_inline void * rb_event_data(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ @@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/** + * ring_buffer_event_time_stamp - return the event's extended timestamp + * @event: the event to get the timestamp of + * + * Returns the extended timestamp associated with a data event. + * An extended time_stamp is a 64-bit timestamp represented + * internally in a special way that makes the best use of space + * contained within a ring buffer event. This function decodes + * it and maps it to a straight u64 value. 
+ */ +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -2217,12 +2243,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path, do not inline */ static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { - event->type_len = RINGBUF_TYPE_TIME_EXTEND; + if (abs) + event->type_len = RINGBUF_TYPE_TIME_STAMP; + else + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* Not the first event on the page? */ - if (rb_event_index(event)) { + /* Not the first event on the page, or not delta? */ + if (abs || rb_event_index(event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2265,7 +2294,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the resevered space. */ if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + + event = rb_add_time_stamp(event, info->delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; } @@ -2453,7 +2484,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer static inline void rb_event_discard(struct ring_buffer_event *event) { - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ @@ -2497,10 +2528,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->write_stamp += delta; + } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->write_stamp = delta; } else cpu_buffer->write_stamp += event->time_delta; } @@ -2680,7 +2712,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail) + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) info->delta = 0; /* See if we shot pass the end of this buffer page */ @@ -2757,8 +2789,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* make sure this diff is calculated here */ barrier(); - /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { + if (ring_buffer_time_stamp_abs(buffer)) { + info.delta = info.ts; + rb_handle_timestamp(cpu_buffer, &info); + } else /* Did the write stamp get updated already? 
*/ + if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; if (unlikely(test_time_stamp(info.delta))) rb_handle_timestamp(cpu_buffer, &info); @@ -3440,14 +3475,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3471,14 +3505,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; + delta = ring_buffer_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + delta = ring_buffer_event_time_stamp(event); + iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: @@ -3702,6 +3735,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, struct buffer_page *reader; int nr_loops = 0; + if (ts) + *ts = 0; again: /* * We repeat when a time extend is encountered. @@ -3738,12 +3773,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); @@ -3768,6 +3808,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; + if (ts) + *ts = 0; + cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; @@ -3820,12 +3863,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) goto again; case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ + if (ts) { + *ts = ring_buffer_event_time_stamp(event); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: - if (ts) { + if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); -- cgit v1.2.3-71-gd317 From 2c1ea60b195da6c4661ec5e4d61f68b8b34e113b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:41 -0600 Subject: tracing: Add timestamp_mode trace file Add a new option flag indicating whether or not the ring buffer is in 'absolute timestamp' mode. Currently this is only set/unset by hist triggers that make use of a common_timestamp. As such, there's no reason to make this writeable for users - its purpose is only to allow users to determine unequivocally whether or not the ring buffer is in that mode (although absolute timestamps can coexist with the normal delta timestamps, when the ring buffer is in absolute mode, timestamps written while absolute mode is in effect take up more space in the buffer, and are not as efficient). 
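As a quick illustration of what the new file looks like in the default mode (path assumes tracefs mounted at /sys/kernel/debug/tracing):

	# cat /sys/kernel/debug/tracing/timestamp_mode
	[delta] absolute

The bracketed entry is the mode currently in effect; the file is created read-only (0444), since only hist triggers that use common_timestamp switch the mode.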
Link: http://lkml.kernel.org/r/e8aa7b1cde1cf15014e66545d06ac6ef2ebba456.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace.txt | 24 +++++++++++++++++++++ kernel/trace/trace.c | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index d4601df6e72e..54213e5c23f6 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -539,6 +539,30 @@ of ftrace. Here is a list of some of the key files: See events.txt for more information. + timestamp_mode: + + Certain tracers may change the timestamp mode used when + logging trace events into the event buffer. Events with + different modes can coexist within a buffer but the mode in + effect when an event is logged determines which timestamp mode + is used for that event. The default timestamp mode is + 'delta'. + + Usual timestamp modes for tracing: + + # cat timestamp_mode + [delta] absolute + + The timestamp mode with the square brackets around it is the + one in effect. + + delta: Default timestamp mode - timestamp is a delta against + a per-buffer timestamp. + + absolute: The timestamp is a full timestamp, not a delta + against some other value. As such it takes up more + space and is less efficient. + hwlat_detector: Directory for the Hardware Latency Detector. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cba003f0362e..988d94a05e81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4515,6 +4515,9 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif + "\n timestamp_mode\t-view the mode used to timestamp events\n" + " delta: Delta difference against a buffer-wide timestamp\n" + " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" @@ -6282,6 +6285,40 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return ret; } +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + + mutex_lock(&trace_types_lock); + + if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer)) + seq_puts(m, "delta [absolute]\n"); + else + seq_puts(m, "[delta] absolute\n"); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) { int ret = 0; @@ -6560,6 +6597,13 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +static const struct file_operations trace_time_stamp_mode_fops = { + .open = tracing_time_stamp_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, @@ -7882,6 +7926,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) 
trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + trace_create_file("timestamp_mode", 0444, d_tracer, tr, + &trace_time_stamp_mode_fops); + create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) -- cgit v1.2.3-71-gd317 From 1ac4f51c0eb518e04ff3455f0c7d17ad9187eb27 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:42 -0600 Subject: tracing: Give event triggers access to ring_buffer_event The ring_buffer event can provide a timestamp that may be useful to various triggers - pass it into the handlers for that purpose. Link: http://lkml.kernel.org/r/6de592683b59fa70ffa5d43d0109896623fc1367.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 14 ++++++----- kernel/trace/trace.h | 9 +++---- kernel/trace/trace_events_hist.c | 11 +++++---- kernel/trace/trace_events_trigger.c | 47 +++++++++++++++++++++++-------------- 4 files changed, 49 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a1442c4e513..0cf48c61cc6d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -430,11 +430,13 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); -extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, - void *rec); -extern void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt, - void *rec); +extern enum event_trigger_type +event_triggers_call(struct trace_event_file *file, void *rec, + struct ring_buffer_event *event); +extern void +event_triggers_post_call(struct trace_event_file *file, + enum event_trigger_type tt, + void *rec, struct ring_buffer_event *event); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); @@ -454,7 +456,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL); + event_triggers_call(file, NULL, NULL); if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; if (eflags & EVENT_FILE_FL_PID_FILTER) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 477341710ebf..99060f7eebbd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1294,7 +1294,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); + *tt = event_triggers_call(file, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1331,7 +1331,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } /** @@ -1364,7 +1364,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt, entry); + event_triggers_post_call(file, tt, entry, event); } #define FILTER_PRED_INVALID ((unsigned short)-1) @@ -1589,7 +1589,8 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec); + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct 
event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 712260e72be5..63a19123cf47 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -909,7 +909,8 @@ static inline void add_to_key(char *compound_key, void *key, memcpy(compound_key + key_field->offset, key, size); } -static void event_hist_trigger(struct event_trigger_data *data, void *rec) +static void event_hist_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); @@ -1658,7 +1659,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec) +hist_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; @@ -1674,7 +1676,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +hist_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1682,7 +1685,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec); + hist_enable_trigger(data, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 87411482a46f..632471692462 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. 
*/ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec) +event_triggers_call(struct trace_event_file *file, void *rec, + struct ring_buffer_event *event) { struct event_trigger_data *data; enum event_trigger_type tt = ETT_NONE; @@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) if (data->paused) continue; if (!rec) { - data->ops->func(data, rec); + data->ops->func(data, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec); + data->ops->func(data, rec, event); } return tt; } @@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt, - void *rec) + void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, rec); + data->ops->func(data, rec, event); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -909,7 +910,8 @@ void set_named_trigger_data(struct event_trigger_data *data, } static void -traceon_trigger(struct event_trigger_data *data, void *rec) +traceon_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -918,7 +920,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec) } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec) +traceon_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (tracing_is_on()) return; @@ -933,7 +936,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec) } static void -traceoff_trigger(struct event_trigger_data *data, void *rec) +traceoff_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -942,7 +946,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec) } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec) +traceoff_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!tracing_is_on()) return; @@ -1039,13 +1044,15 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec) +snapshot_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec) +snapshot_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1053,7 +1060,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec); + snapshot_trigger(data, rec, event); } static int @@ -1141,13 +1148,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec) +stacktrace_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec) +stacktrace_count_trigger(struct event_trigger_data 
*data, void *rec, + struct ring_buffer_event *event) { if (!data->count) return; @@ -1155,7 +1164,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec); + stacktrace_trigger(data, rec, event); } static int @@ -1217,7 +1226,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec) +event_enable_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1228,7 +1238,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec) } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec) +event_enable_count_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1242,7 +1253,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec); + event_enable_trigger(data, rec, event); } int event_enable_trigger_print(struct seq_file *m, -- cgit v1.2.3-71-gd317 From fbd302cbebe9408699fd11a4eb423d0a466058b9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:43 -0600 Subject: tracing: Add ring buffer event param to hist field functions Some events such as timestamps require access to a ring_buffer_event struct; add a param so that hist field functions can access that. Link: http://lkml.kernel.org/r/2ff4af18e72b6002eb86b26b2a7f39cef7d1dfe4.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 63a19123cf47..37f5acefdc6c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -26,7 +26,8 @@ struct hist_field; -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event, + struct ring_buffer_event *rbe); #define HIST_FIELD_OPERANDS_MAX 2 @@ -40,24 +41,28 @@ struct hist_field { struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; }; -static u64 hist_field_none(struct hist_field *field, void *event) +static u64 hist_field_none(struct hist_field *field, void *event, + struct ring_buffer_event *rbe) { return 0; } -static u64 hist_field_counter(struct hist_field *field, void *event) +static u64 hist_field_counter(struct hist_field *field, void *event, + struct ring_buffer_event *rbe) { return 1; } -static u64 hist_field_string(struct hist_field *hist_field, void *event) +static u64 hist_field_string(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) { char *addr = (char *)(event + hist_field->field->offset); return (u64)(unsigned long)addr; } -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +static u64 hist_field_dynstring(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) { u32 str_item = *(u32 *)(event + hist_field->field->offset); int str_loc = str_item & 0xffff; @@ -66,24 +71,28 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) return (u64)(unsigned long)addr; } -static u64 
hist_field_pstring(struct hist_field *hist_field, void *event) +static u64 hist_field_pstring(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) { char **addr = (char **)(event + hist_field->field->offset); return (u64)(unsigned long)*addr; } -static u64 hist_field_log2(struct hist_field *hist_field, void *event) +static u64 hist_field_log2(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, event); + u64 val = operand->fn(operand, event, rbe); return (u64) ilog2(roundup_pow_of_two(val)); } #define DEFINE_HIST_FIELD_FN(type) \ -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ + static u64 hist_field_##type(struct hist_field *hist_field, \ + void *event, \ + struct ring_buffer_event *rbe) \ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ @@ -871,8 +880,8 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - void *rec) + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe) { struct hist_field *hist_field; unsigned int i; @@ -880,7 +889,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, rec); + hist_val = hist_field->fn(hist_field, rec, rbe); tracing_map_update_sum(elt, i, hist_val); } } @@ -910,7 +919,7 @@ static inline void add_to_key(char *compound_key, void *key, } static void event_hist_trigger(struct event_trigger_data *data, void *rec, - struct ring_buffer_event *event) + struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); @@ -939,7 +948,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key = entries; } else { - field_contents = key_field->fn(key_field, rec); + field_contents = key_field->fn(key_field, rec, rbe); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -956,7 +965,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, elt = tracing_map_insert(hist_data->map, key); if (elt) - hist_trigger_elt_update(hist_data, elt, rec); + hist_trigger_elt_update(hist_data, elt, rec, rbe); } static void hist_trigger_stacktrace_print(struct seq_file *m, -- cgit v1.2.3-71-gd317 From 9b1ae035c9304ed1e183de3b3bb08eafd01a7553 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:44 -0600 Subject: tracing: Break out hist trigger assignment parsing This will make it easier to add variables, and makes the parsing code cleaner regardless. 
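For reference, a trigger string exercising several of the assignments the new parse_assignment() helper recognizes (event name and tracefs path are illustrative):

	# echo 'hist:keys=common_pid:vals=hitcount:sort=hitcount:size=2048' > \
	      /sys/kernel/debug/tracing/events/sched/sched_switch/trigger

Each ':'-separated token containing an '=' is routed to parse_assignment(); bare keywords such as 'pause', 'cont', and 'clear' keep taking the other branch.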
Link: http://lkml.kernel.org/r/e574b3291bbe15e35a4dfc87e5395aa715701c98.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Rajvi Jingar Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 72 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 37f5acefdc6c..e4368bb7ba30 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -251,6 +251,51 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) kfree(attrs); } +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = 0; + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) { + attrs->keys_str = kstrdup(str, GFP_KERNEL); + if (!attrs->keys_str) { + ret = -ENOMEM; + goto out; + } + } else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) { + attrs->vals_str = kstrdup(str, GFP_KERNEL); + if (!attrs->vals_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "sort=", strlen("sort=")) == 0) { + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + if (!attrs->sort_key_str) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "name=", strlen("name=")) == 0) { + attrs->name = kstrdup(str, GFP_KERNEL); + if (!attrs->name) { + ret = -ENOMEM; + goto out; + } + } else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto out; + } + attrs->map_bits = map_bits; + } else + ret = -EINVAL; + out: + return ret; +} + static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) { struct hist_trigger_attrs *attrs; @@ -263,33 +308,18 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) while (trigger_str) { char *str = strsep(&trigger_str, ":"); - if ((strncmp(str, "key=", strlen("key=")) == 0) || - (strncmp(str, "keys=", strlen("keys=")) == 0)) - attrs->keys_str = kstrdup(str, GFP_KERNEL); - else if ((strncmp(str, "val=", strlen("val=")) == 0) || - (strncmp(str, "vals=", strlen("vals=")) == 0) || - (strncmp(str, "values=", strlen("values=")) == 0)) - attrs->vals_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "sort=", strlen("sort=")) == 0) - attrs->sort_key_str = kstrdup(str, GFP_KERNEL); - else if (strncmp(str, "name=", strlen("name=")) == 0) - attrs->name = kstrdup(str, GFP_KERNEL); - else if (strcmp(str, "pause") == 0) + if (strchr(str, '=')) { + ret = parse_assignment(str, attrs); + if (ret) + goto free; + } else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) attrs->cont = true; else if (strcmp(str, "clear") == 0) attrs->clear = true; - else if (strncmp(str, "size=", strlen("size=")) == 0) { - int map_bits = parse_map_size(str); - - if (map_bits < 0) { - ret = map_bits; - goto free; - } - attrs->map_bits = map_bits; - } else { + else { ret = -EINVAL; goto free; } -- cgit v1.2.3-71-gd317 From ad42febe51ae0a2e875f507a37a6329277f75fdd Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:45 -0600 Subject: tracing: Add hist trigger timestamp support Add support for a timestamp event field. 
This is actually a 'pseudo-' event field in that it behaves like it's part of the event record, but is really part of the corresponding ring buffer event. To make use of the timestamp field, users can specify "common_timestamp" as a field name for any histogram. Note that this doesn't make much sense on its own as either a key or value, but needs to be supported even so, since follow-on patches will add support for making use of this field in time deltas. The common_timestamp 'field' is not a bona fide event field - so you won't find it in the event description - but rather it's a synthetic field that can be used like a real field. Note that the use of this field requires the ring buffer be put into 'absolute timestamp' mode, which saves the complete timestamp for each event rather than an offset. This mode will be enabled if and only if a histogram makes use of the "common_timestamp" field. Link: http://lkml.kernel.org/r/97afbd646ed146e26271f3458b4b33e16d7817c2.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Baohong Liu [kasan use-after-free fix] Signed-off-by: Vedang Patel Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 94 ++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e4368bb7ba30..a793f8c04830 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -89,6 +89,12 @@ static u64 hist_field_log2(struct hist_field *hist_field, void *event, return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) +{ + return ring_buffer_event_time_stamp(rbe); +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ void *event, \ @@ -135,6 +141,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYSCALL = 1 << 7, HIST_FIELD_FL_STACKTRACE = 1 << 8, HIST_FIELD_FL_LOG2 = 1 << 9, + HIST_FIELD_FL_TIMESTAMP = 1 << 10, }; struct hist_trigger_attrs { @@ -159,6 +166,7 @@ struct hist_trigger_data { struct trace_event_file *event_file; struct hist_trigger_attrs *attrs; struct tracing_map *map; + bool enable_timestamps; }; static const char *hist_field_name(struct hist_field *field, @@ -173,6 +181,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->field->name; else if (field->flags & HIST_FIELD_FL_LOG2) field_name = hist_field_name(field->operands[0], ++level); + else if (field->flags & HIST_FIELD_FL_TIMESTAMP) + field_name = "common_timestamp"; if (field_name == NULL) field_name = ""; @@ -440,6 +450,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (flags & HIST_FIELD_FL_TIMESTAMP) { + hist_field->fn = hist_field_timestamp; + hist_field->size = sizeof(u64); + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -517,10 +533,15 @@ static int create_val_field(struct hist_trigger_data *hist_data, } } - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { - ret = -EINVAL; - goto out; + if (strcmp(field_name, "common_timestamp") == 0) { + flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + } else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + ret = -EINVAL; + goto out; + } } hist_data->fields[val_idx] = create_hist_field(field,
flags); @@ -615,16 +636,22 @@ static int create_key_field(struct hist_trigger_data *hist_data, } } - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { - ret = -EINVAL; - goto out; - } + if (strcmp(field_name, "common_timestamp") == 0) { + flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + key_size = sizeof(u64); + } else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + ret = -EINVAL; + goto out; + } - if (is_string_field(field)) - key_size = MAX_FILTER_STR_VAL; - else - key_size = field->size; + if (is_string_field(field)) + key_size = MAX_FILTER_STR_VAL; + else + key_size = field->size; + } } hist_data->fields[key_idx] = create_hist_field(field, flags); @@ -820,6 +847,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; + else if (!field) + cmp_fn = tracing_map_cmp_num(hist_field->size, + hist_field->is_signed); else if (is_string_field(field)) cmp_fn = tracing_map_cmp_string; else @@ -1215,7 +1245,11 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); - seq_printf(m, "%s", field_name); + if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) + seq_puts(m, "common_timestamp"); + else if (field_name) + seq_printf(m, "%s", field_name); + if (hist_field->flags) { const char *flags_str = get_hist_field_flags(hist_field); @@ -1266,27 +1300,25 @@ static int event_hist_trigger_print(struct seq_file *m, for (i = 0; i < hist_data->n_sort_keys; i++) { struct tracing_map_sort_key *sort_key; + unsigned int idx; sort_key = &hist_data->sort_keys[i]; + idx = sort_key->field_idx; + + if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; if (i > 0) seq_puts(m, ","); - if (sort_key->field_idx == HITCOUNT_IDX) + if (idx == HITCOUNT_IDX) seq_puts(m, "hitcount"); - else { - unsigned int idx = sort_key->field_idx; - - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) - return -EINVAL; - + else hist_field_print(m, hist_data->fields[idx]); - } if (sort_key->descending) seq_puts(m, ".descending"); } - seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); if (data->filter_str) @@ -1454,6 +1486,10 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; if (key_field->offset != key_field_test->offset) return false; + if (key_field->size != key_field_test->size) + return false; + if (key_field->is_signed != key_field_test->is_signed) + return false; } for (i = 0; i < hist_data->n_sort_keys; i++) { @@ -1536,6 +1572,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, update_cond_flag(file); + if (hist_data->enable_timestamps) + tracing_set_time_stamp_abs(file->tr, true); + if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); update_cond_flag(file); @@ -1570,17 +1609,26 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (unregistered && test->ops->free) test->ops->free(test->ops, test); + + if (hist_data->enable_timestamps) { + if (unregistered) + tracing_set_time_stamp_abs(file->tr, false); + } } static void hist_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; + struct hist_trigger_data *hist_data; list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; 
list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); + if (hist_data->enable_timestamps) + tracing_set_time_stamp_abs(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } -- cgit v1.2.3-71-gd317 From 2734b629525a9dae5bf217cbf0a9651da93d2108 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:46 -0600 Subject: tracing: Add per-element variable support to tracing_map In order to allow information to be passed between trace events, add support for per-element variables to tracing_map. This provides a means for histograms to associate a value or values with an entry when it's saved or updated, and to have them retrieved on subsequent event occurrences. Variables can be set using tracing_map_set_var() and read using tracing_map_read_var(). tracing_map_var_set() returns true or false depending on whether the variable has been set, which is important for event-matching applications. tracing_map_read_var_once() reads the variable and resets it to the 'unset' state, implementing read-once variables, which are also important for event-matching uses. Link: http://lkml.kernel.org/r/7fa001108252556f0c6dd9d63145eabfe3370d1a.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 108 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/tracing_map.h | 11 +++++ 2 files changed, 119 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index f47a4d54bcf0..5cadb1b8b5fe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) return (u64)atomic64_read(&elt->fields[i].sum); } +/** + * tracing_map_set_var - Assign a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * @n: The value to assign + * + * Assign n to variable i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. + */ +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_set(&elt->vars[i], n); + elt->var_set[i] = true; +} + +/** + * tracing_map_var_set - Return whether or not a variable has been set + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Return true if the variable has been set, false otherwise. The + * index i is the index returned by the call to tracing_map_add_var() + * when the tracing map was set up. + */ +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i) +{ + return elt->var_set[i]; +} + +/** + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_var() when the tracing map was set + * up. + * + * Return: The variable value associated with field i for elt.
+ */ +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->vars[i]); +} + +/** + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance, and reset the variable to the 'not set' + * state. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. The reset + * essentially makes the variable a read-once variable if it's only + * accessed using this function. + * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i) +{ + elt->var_set[i] = false; + return (u64)atomic64_read(&elt->vars[i]); +} + int tracing_map_cmp_string(void *val_a, void *val_b) { char *a = val_a; @@ -170,6 +237,28 @@ int tracing_map_add_sum_field(struct tracing_map *map) return tracing_map_add_field(map, tracing_map_cmp_atomic64); } +/** + * tracing_map_add_var - Add a field describing a tracing_map var + * @map: The tracing_map + * + * Add a var to the map and return the index identifying it in the map + * and associated tracing_map_elts. This is the index used for + * instance to update a var for a particular tracing_map_elt using + * tracing_map_update_var() or reading it via tracing_map_read_var(). + * + * Return: The index identifying the var in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_var(struct tracing_map *map) +{ + int ret = -EINVAL; + + if (map->n_vars < TRACING_MAP_VARS_MAX) + ret = map->n_vars++; + + return ret; +} + /** * tracing_map_add_key_field - Add a field describing a tracing_map key * @map: The tracing_map @@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt) if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) atomic64_set(&elt->fields[i].sum, 0); + for (i = 0; i < elt->map->n_vars; i++) { + atomic64_set(&elt->vars[i], 0); + elt->var_set[i] = false; + } + if (elt->map->ops && elt->map->ops->elt_clear) elt->map->ops->elt_clear(elt); } @@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) if (elt->map->ops && elt->map->ops->elt_free) elt->map->ops->elt_free(elt); kfree(elt->fields); + kfree(elt->vars); + kfree(elt->var_set); kfree(elt->key); kfree(elt); } @@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) goto free; } + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); + if (!elt->vars) { + err = -ENOMEM; + goto free; + } + + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); + if (!elt->var_set) { + err = -ENOMEM; + goto free; + } + tracing_map_elt_init_fields(elt); if (map->ops && map->ops->elt_alloc) { diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index de57887c0670..053eb92b2d31 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -10,6 +10,7 @@ #define TRACING_MAP_VALS_MAX 3 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ TRACING_MAP_VALS_MAX) +#define TRACING_MAP_VARS_MAX 16 #define TRACING_MAP_SORT_KEYS_MAX 2 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); @@ -137,6 +138,8 @@ struct tracing_map_field { struct tracing_map_elt { struct tracing_map *map; struct tracing_map_field *fields; + atomic64_t *vars; + bool 
*var_set; void *key; void *private_data; }; @@ -192,6 +195,7 @@ struct tracing_map { int key_idx[TRACING_MAP_KEYS_MAX]; unsigned int n_keys; struct tracing_map_sort_key sort_key; + unsigned int n_vars; atomic64_t hits; atomic64_t drops; }; @@ -241,6 +245,7 @@ tracing_map_create(unsigned int map_bits, extern int tracing_map_init(struct tracing_map *map); extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_var(struct tracing_map *map); extern int tracing_map_add_key_field(struct tracing_map *map, unsigned int offset, tracing_map_cmp_fn_t cmp_fn); @@ -260,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b); extern void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n); +extern void tracing_map_set_var(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); + extern void tracing_map_set_field_descr(struct tracing_map *map, unsigned int i, unsigned int key_offset, -- cgit v1.2.3-71-gd317 From b559d003a226911979ceb8469db4c9b621c3bc09 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:47 -0600 Subject: tracing: Add hist_data member to hist_field Allow hist_data access via hist_field. Some users of hist_fields require or will require more access to the associated hist_data. Link: http://lkml.kernel.org/r/d04cd0768f5228ebb4ac0ba4a847bc4d14d4826f.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a793f8c04830..77ebe6b410ba 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -39,6 +39,7 @@ struct hist_field { unsigned int offset; unsigned int is_signed; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; + struct hist_trigger_data *hist_data; }; static u64 hist_field_none(struct hist_field *field, void *event, @@ -420,7 +421,8 @@ static void destroy_hist_field(struct hist_field *hist_field, kfree(hist_field); } -static struct hist_field *create_hist_field(struct ftrace_event_field *field, +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, + struct ftrace_event_field *field, unsigned long flags) { struct hist_field *hist_field; @@ -432,6 +434,8 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (!hist_field) return NULL; + hist_field->hist_data = hist_data; + if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; goto out; @@ -445,7 +449,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (flags & HIST_FIELD_FL_LOG2) { unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; - hist_field->operands[0] = create_hist_field(field, fl); + hist_field->operands[0] = create_hist_field(hist_data, field, fl); hist_field->size = hist_field->operands[0]->size; goto out; } @@ -498,7 +502,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) static int create_hitcount_val(struct hist_trigger_data *hist_data) { 
hist_data->fields[HITCOUNT_IDX] = - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT); if (!hist_data->fields[HITCOUNT_IDX]) return -ENOMEM; @@ -544,7 +548,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, } } - hist_data->fields[val_idx] = create_hist_field(field, flags); + hist_data->fields[val_idx] = create_hist_field(hist_data, field, flags); if (!hist_data->fields[val_idx]) { ret = -ENOMEM; goto out; @@ -654,7 +658,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } } - hist_data->fields[key_idx] = create_hist_field(field, flags); + hist_data->fields[key_idx] = create_hist_field(hist_data, field, flags); if (!hist_data->fields[key_idx]) { ret = -ENOMEM; goto out; -- cgit v1.2.3-71-gd317 From 860f9f6b02e9e846c4cfb3505efed331a910d0b7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:48 -0600 Subject: tracing: Add usecs modifier for hist trigger timestamps Appending .usecs onto a common_timestamp field will cause the timestamp value to be in microseconds instead of the default nanoseconds. A typical latency histogram using usecs would look like this: # echo 'hist:keys=pid,prio:ts0=common_timestamp.usecs ... # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0 ... This also adds an external trace_clock_in_ns() to trace.c for the timestamp conversion. Link: http://lkml.kernel.org/r/4e813705a170b3e13e97dc3135047362fb1a39f3.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 1 + kernel/trace/trace.c | 13 +++++++++++-- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_hist.c | 28 ++++++++++++++++++++++------ 4 files changed, 36 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index a4143f04a097..25c94730d3fe 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -74,6 +74,7 @@ .syscall display a syscall id as a system call name .execname display a common_pid as a program name .log2 display log2 value rather than raw number + .usecs display a common_timestamp in microseconds Note that in general the semantics of a given field aren't interpreted when applying a modifier to it, but there are some diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 988d94a05e81..82cc8891fda6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1168,6 +1168,14 @@ static struct { ARCH_TRACE_CLOCKS }; +bool trace_clock_in_ns(struct trace_array *tr) +{ + if (trace_clocks[tr->clock_id].in_ns) + return true; + + return false; +} + /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -4694,8 +4702,9 @@ static const char readme_msg[] = "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" - "\t .syscall display a syscall id as a syscall name\n\n" - "\t .log2 display log2 value rather than raw number\n\n" + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 
'continue' can be used to start or\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 99060f7eebbd..89771b4f16df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -289,6 +289,8 @@ extern void trace_array_put(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern bool trace_clock_in_ns(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 77ebe6b410ba..7f5f0b8f6558 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -90,12 +90,6 @@ static u64 hist_field_log2(struct hist_field *hist_field, void *event, return (u64) ilog2(roundup_pow_of_two(val)); } -static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) -{ - return ring_buffer_event_time_stamp(rbe); -} - #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ void *event, \ @@ -143,6 +137,7 @@ enum hist_field_flags { HIST_FIELD_FL_STACKTRACE = 1 << 8, HIST_FIELD_FL_LOG2 = 1 << 9, HIST_FIELD_FL_TIMESTAMP = 1 << 10, + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, }; struct hist_trigger_attrs { @@ -153,6 +148,7 @@ struct hist_trigger_attrs { bool pause; bool cont; bool clear; + bool ts_in_usecs; unsigned int map_bits; }; @@ -170,6 +166,20 @@ struct hist_trigger_data { bool enable_timestamps; }; +static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) +{ + struct hist_trigger_data *hist_data = hist_field->hist_data; + struct trace_array *tr = hist_data->event_file->tr; + + u64 ts = ring_buffer_event_time_stamp(rbe); + + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) + ts = ns2usecs(ts); + + return ts; +} + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -634,6 +644,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_SYSCALL; else if (strcmp(field_str, "log2") == 0) flags |= HIST_FIELD_FL_LOG2; + else if (strcmp(field_str, "usecs") == 0) + flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { ret = -EINVAL; goto out; @@ -643,6 +655,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (strcmp(field_name, "common_timestamp") == 0) { flags |= HIST_FIELD_FL_TIMESTAMP; hist_data->enable_timestamps = true; + if (flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; key_size = sizeof(u64); } else { field = trace_find_event_field(file->event_call, field_name); @@ -1241,6 +1255,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "syscall"; else if (hist_field->flags & HIST_FIELD_FL_LOG2) flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; return flags_str; } -- cgit v1.2.3-71-gd317 From 30350d65ac5676c6d08d4fc935bc9a9cb0fd4ed3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:49 -0600 Subject: tracing: Add variable support to hist triggers Add support for saving the value of a current event's event field by assigning it to a variable that can be read by a subsequent event. The basic syntax for saving a variable is to simply prefix a unique variable name not corresponding to any keyword along with an '=' sign to any event field. 
Both keys and values can be saved and retrieved in this way: # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... # echo 'hist:timer_pid=common_pid:key=$timer_pid ...' If a variable isn't a key variable or prefixed with 'vals=', the associated event field will be saved in a variable but won't be summed as a value: # echo 'hist:keys=next_pid:ts1=common_timestamp:... Multiple variables can be assigned at the same time: # echo 'hist:keys=pid:vals=$ts0,$b,field2:ts0=common_timestamp,b=field1 ... Multiple (or single) variables can also be assigned at the same time using separate assignments: # echo 'hist:keys=pid:vals=$ts0:ts0=common_timestamp:b=field1:c=field2 ... Variables set as above can be used by being referenced from another event, as described in a subsequent patch. Link: http://lkml.kernel.org/r/fc93c4944d9719dbcb1d0067be627d44e98e2adc.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Baohong Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 370 ++++++++++++++++++++++++++++++++++----- 1 file changed, 331 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7f5f0b8f6558..8f43f24bf49c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -30,6 +30,13 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event, struct ring_buffer_event *rbe); #define HIST_FIELD_OPERANDS_MAX 2 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) + +struct hist_var { + char *name; + struct hist_trigger_data *hist_data; + unsigned int idx; +}; struct hist_field { struct ftrace_event_field *field; @@ -40,6 +47,7 @@ struct hist_field { unsigned int is_signed; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + struct hist_var var; }; static u64 hist_field_none(struct hist_field *field, void *event, @@ -138,6 +146,13 @@ enum hist_field_flags { HIST_FIELD_FL_LOG2 = 1 << 9, HIST_FIELD_FL_TIMESTAMP = 1 << 10, HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, + HIST_FIELD_FL_VAR = 1 << 12, +}; + +struct var_defs { + unsigned int n_vars; + char *name[TRACING_MAP_VARS_MAX]; + char *expr[TRACING_MAP_VARS_MAX]; }; struct hist_trigger_attrs { @@ -150,13 +165,19 @@ struct hist_trigger_attrs { bool clear; bool ts_in_usecs; unsigned int map_bits; + + char *assignment_str[TRACING_MAP_VARS_MAX]; + unsigned int n_assignments; + + struct var_defs var_defs; }; struct hist_trigger_data { - struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + struct hist_field *fields[HIST_FIELDS_MAX]; unsigned int n_vals; unsigned int n_keys; unsigned int n_fields; + unsigned int n_vars; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; @@ -164,6 +185,7 @@ struct hist_trigger_data { struct hist_trigger_attrs *attrs; struct tracing_map *map; bool enable_timestamps; + bool remove; }; static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, @@ -180,6 +202,48 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, return ts; } +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data, + const char *var_name) +{ + struct hist_field *hist_field, *found = NULL; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR && + strcmp(hist_field->var.name, var_name) == 0) { + found = 
hist_field; + break; + } + } + + return found; +} + +static struct hist_field *find_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + hist_field = find_var_field(hist_data, var_name); + if (hist_field) + return hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -262,9 +326,14 @@ static int parse_map_size(char *str) static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) { + unsigned int i; + if (!attrs) return; + for (i = 0; i < attrs->n_assignments; i++) + kfree(attrs->assignment_str[i]); + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); @@ -311,8 +380,22 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) goto out; } attrs->map_bits = map_bits; - } else - ret = -EINVAL; + } else { + char *assignment; + + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { + ret = -EINVAL; + goto out; + } + + assignment = kstrdup(str, GFP_KERNEL); + if (!assignment) { + ret = -ENOMEM; + goto out; + } + + attrs->assignment_str[attrs->n_assignments++] = assignment; + } out: return ret; } @@ -428,12 +511,15 @@ static void destroy_hist_field(struct hist_field *hist_field, for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) destroy_hist_field(hist_field->operands[i], level + 1); + kfree(hist_field->var.name); + kfree(hist_field); } static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, struct ftrace_event_field *field, - unsigned long flags) + unsigned long flags, + char *var_name) { struct hist_field *hist_field; @@ -459,7 +545,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_LOG2) { unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; - hist_field->operands[0] = create_hist_field(hist_data, field, fl); + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; goto out; } @@ -494,14 +580,23 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->field = field; hist_field->flags = flags; + if (var_name) { + hist_field->var.name = kstrdup(var_name, GFP_KERNEL); + if (!hist_field->var.name) + goto free; + } + return hist_field; + free: + destroy_hist_field(hist_field, 0); + return NULL; } static void destroy_hist_fields(struct hist_trigger_data *hist_data) { unsigned int i; - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + for (i = 0; i < HIST_FIELDS_MAX; i++) { if (hist_data->fields[i]) { destroy_hist_field(hist_data->fields[i], 0); hist_data->fields[i] = NULL; @@ -512,11 +607,12 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) static int create_hitcount_val(struct hist_trigger_data *hist_data) { hist_data->fields[HITCOUNT_IDX] = - create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT); + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL); if (!hist_data->fields[HITCOUNT_IDX]) return -ENOMEM; hist_data->n_vals++; + hist_data->n_fields++; if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) return -EINVAL; @@ -524,19 
+620,16 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data) return 0; } -static int create_val_field(struct hist_trigger_data *hist_data, - unsigned int val_idx, - struct trace_event_file *file, - char *field_str) +static int __create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *field_str, + unsigned long flags) { struct ftrace_event_field *field = NULL; - unsigned long flags = 0; char *field_name; int ret = 0; - if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) - return -EINVAL; - field_name = strsep(&field_str, "."); if (field_str) { if (strcmp(field_str, "hex") == 0) @@ -558,25 +651,58 @@ static int create_val_field(struct hist_trigger_data *hist_data, } } - hist_data->fields[val_idx] = create_hist_field(hist_data, field, flags); + hist_data->fields[val_idx] = create_hist_field(hist_data, field, flags, var_name); if (!hist_data->fields[val_idx]) { ret = -ENOMEM; goto out; } ++hist_data->n_vals; + ++hist_data->n_fields; - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) ret = -EINVAL; out: return ret; } +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); +} + +static int create_var_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *expr_str) +{ + unsigned long flags = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) + return -EINVAL; + if (find_var(hist_data, file, var_name) && !hist_data->remove) { + return -EINVAL; + } + + flags |= HIST_FIELD_FL_VAR; + hist_data->n_vars++; + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) + return -EINVAL; + + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); +} + static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { char *fields_str, *field_str; - unsigned int i, j; + unsigned int i, j = 1; int ret; ret = create_hitcount_val(hist_data); @@ -596,12 +722,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data, field_str = strsep(&fields_str, ","); if (!field_str) break; + if (strcmp(field_str, "hitcount") == 0) continue; + ret = create_val_field(hist_data, j++, file, field_str); if (ret) goto out; } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) ret = -EINVAL; out: @@ -615,11 +744,12 @@ static int create_key_field(struct hist_trigger_data *hist_data, char *field_str) { struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; unsigned long flags = 0; unsigned int key_size; int ret = 0; - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; flags |= HIST_FIELD_FL_KEY; @@ -627,6 +757,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (strcmp(field_str, "stacktrace") == 0) { flags |= HIST_FIELD_FL_STACKTRACE; key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { char *field_name = strsep(&field_str, "."); @@ -672,7 +803,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } } - hist_data->fields[key_idx] = create_hist_field(hist_data, field, flags); 
+ hist_data->fields[key_idx] = create_hist_field(hist_data, field, flags, NULL); if (!hist_data->fields[key_idx]) { ret = -ENOMEM; goto out; @@ -688,6 +819,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } hist_data->n_keys++; + hist_data->n_fields++; if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) return -EINVAL; @@ -731,21 +863,111 @@ static int create_key_fields(struct hist_trigger_data *hist_data, return ret; } +static int create_var_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, j = hist_data->n_vals; + int ret = 0; + + unsigned int n_vars = hist_data->attrs->var_defs.n_vars; + + for (i = 0; i < n_vars; i++) { + char *var_name = hist_data->attrs->var_defs.name[i]; + char *expr = hist_data->attrs->var_defs.expr[i]; + + ret = create_var_field(hist_data, j++, file, var_name, expr); + if (ret) + goto out; + } + out: + return ret; +} + +static void free_var_defs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + kfree(hist_data->attrs->var_defs.name[i]); + kfree(hist_data->attrs->var_defs.expr[i]); + } + + hist_data->attrs->var_defs.n_vars = 0; +} + +static int parse_var_defs(struct hist_trigger_data *hist_data) +{ + char *s, *str, *var_name, *field_str; + unsigned int i, j, n_vars = 0; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_assignments; i++) { + str = hist_data->attrs->assignment_str[i]; + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) { + field_str = strsep(&str, ","); + if (!field_str) + break; + + var_name = strsep(&field_str, "="); + if (!var_name || !field_str) { + ret = -EINVAL; + goto free; + } + + if (n_vars == TRACING_MAP_VARS_MAX) { + ret = -EINVAL; + goto free; + } + + s = kstrdup(var_name, GFP_KERNEL); + if (!s) { + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.name[n_vars] = s; + + s = kstrdup(field_str, GFP_KERNEL); + if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.expr[n_vars++] = s; + + hist_data->attrs->var_defs.n_vars = n_vars; + } + } + + return ret; + free: + free_var_defs(hist_data); + + return ret; +} + static int create_hist_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { int ret; + ret = parse_var_defs(hist_data); + if (ret) + goto out; + ret = create_val_fields(hist_data, file); if (ret) goto out; - ret = create_key_fields(hist_data, file); + ret = create_var_fields(hist_data, file); if (ret) goto out; - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + ret = create_key_fields(hist_data, file); + if (ret) + goto out; out: + free_var_defs(hist_data); + return ret; } @@ -768,7 +990,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) char *fields_str = hist_data->attrs->sort_key_str; struct tracing_map_sort_key *sort_key; int descending, ret = 0; - unsigned int i, j; + unsigned int i, j, k; hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ @@ -816,12 +1038,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) continue; } - for (j = 1; j < hist_data->n_fields; j++) { + for (j = 1, k = 1; j < hist_data->n_fields; j++) { + unsigned int idx; + hist_field = hist_data->fields[j]; + if (hist_field->flags & HIST_FIELD_FL_VAR) + continue; + + idx = k++; + test_name = hist_field_name(hist_field, 0); if (strcmp(field_name, test_name) == 0) { - sort_key->field_idx = j; + sort_key->field_idx = idx; descending = 
is_descending(field_str); if (descending < 0) { ret = descending; @@ -836,6 +1065,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) break; } } + hist_data->n_sort_keys = i; out: return ret; @@ -876,12 +1106,19 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) idx = tracing_map_add_key_field(map, hist_field->offset, cmp_fn); - - } else + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR)) idx = tracing_map_add_sum_field(map); if (idx < 0) return idx; + + if (hist_field->flags & HIST_FIELD_FL_VAR) { + idx = tracing_map_add_var(map); + if (idx < 0) + return idx; + hist_field->var.idx = idx; + hist_field->var.hist_data = hist_data; + } } return 0; @@ -905,7 +1142,8 @@ static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, - struct trace_event_file *file) + struct trace_event_file *file, + bool remove) { const struct tracing_map_ops *map_ops = NULL; struct hist_trigger_data *hist_data; @@ -916,6 +1154,7 @@ create_hist_data(unsigned int map_bits, return ERR_PTR(-ENOMEM); hist_data->attrs = attrs; + hist_data->remove = remove; ret = create_hist_fields(hist_data, file); if (ret) @@ -962,14 +1201,28 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, struct ring_buffer_event *rbe) { struct hist_field *hist_field; - unsigned int i; + unsigned int i, var_idx; u64 hist_val; for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; hist_val = hist_field->fn(hist_field, rec, rbe); + if (hist_field->flags & HIST_FIELD_FL_VAR) { + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + continue; + } tracing_map_update_sum(elt, i, hist_val); } + + for_each_hist_key_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + hist_val = hist_field->fn(hist_field, rec, rbe); + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + } + } } static inline void add_to_key(char *compound_key, void *key, @@ -1144,6 +1397,9 @@ hist_trigger_entry_print(struct seq_file *m, for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR) + continue; + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, tracing_map_read_sum(elt, i)); @@ -1265,6 +1521,9 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); + if (hist_field->var.name) + seq_printf(m, "%s=", hist_field->var.name); + if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); else if (field_name) @@ -1283,7 +1542,8 @@ static int event_hist_trigger_print(struct seq_file *m, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; - struct hist_field *key_field; + struct hist_field *field; + bool have_var = false; unsigned int i; seq_puts(m, "hist:"); @@ -1294,25 +1554,47 @@ static int event_hist_trigger_print(struct seq_file *m, seq_puts(m, "keys="); for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + field = hist_data->fields[i]; if (i > hist_data->n_vals) seq_puts(m, ","); - if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + if (field->flags & HIST_FIELD_FL_STACKTRACE) seq_puts(m, "stacktrace"); else - hist_field_print(m, key_field); + hist_field_print(m, 
field); } seq_puts(m, ":vals="); for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + if (field->flags & HIST_FIELD_FL_VAR) { + have_var = true; + continue; + } + if (i == HITCOUNT_IDX) seq_puts(m, "hitcount"); else { seq_puts(m, ","); - hist_field_print(m, hist_data->fields[i]); + hist_field_print(m, field); + } + } + + if (have_var) { + unsigned int n = 0; + + seq_puts(m, ":"); + + for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + + if (field->flags & HIST_FIELD_FL_VAR) { + if (n++) + seq_puts(m, ","); + hist_field_print(m, field); + } } } @@ -1320,7 +1602,10 @@ static int event_hist_trigger_print(struct seq_file *m, for (i = 0; i < hist_data->n_sort_keys; i++) { struct tracing_map_sort_key *sort_key; - unsigned int idx; + unsigned int idx, first_key_idx; + + /* skip VAR vals */ + first_key_idx = hist_data->n_vals - hist_data->n_vars; sort_key = &hist_data->sort_keys[i]; idx = sort_key->field_idx; @@ -1333,8 +1618,11 @@ static int event_hist_trigger_print(struct seq_file *m, if (idx == HITCOUNT_IDX) seq_puts(m, "hitcount"); - else + else { + if (idx >= first_key_idx) + idx += hist_data->n_vars; hist_field_print(m, hist_data->fields[idx]); + } if (sort_key->descending) seq_puts(m, ".descending"); @@ -1631,7 +1919,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, test->ops->free(test->ops, test); if (hist_data->enable_timestamps) { - if (unregistered) + if (!hist_data->remove || unregistered) tracing_set_time_stamp_abs(file->tr, false); } } @@ -1664,12 +1952,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct hist_trigger_attrs *attrs; struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + bool remove = false; char *trigger; int ret = 0; if (!param) return -EINVAL; + if (glob[0] == '!') + remove = true; + /* separate the trigger from the filter (k:v [if filter]) */ trigger = strsep(¶m, " \t"); if (!trigger) @@ -1682,7 +1974,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (attrs->map_bits) hist_trigger_bits = attrs->map_bits; - hist_data = create_hist_data(hist_trigger_bits, attrs, file); + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove); if (IS_ERR(hist_data)) { destroy_hist_trigger_attrs(attrs); return PTR_ERR(hist_data); @@ -1711,7 +2003,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - if (glob[0] == '!') { + if (remove) { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); ret = 0; goto out_free; -- cgit v1.2.3-71-gd317 From 1a361dfcf261d68f081a12133aa8d0d6d6cca34f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:50 -0600 Subject: tracing: Account for variables in named trigger compatibility Named triggers must also have the same set of variables in order to be considered compatible - update the trigger match test to account for that. The reason for this requirement is that named triggers with variables are meant to allow one or more events to set the same variable. 
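For example (a sketch; the 'lat' trigger name and the sched events are illustrative, not taken from this patch), two events can now share a named trigger only if they also define the same variables:

  # echo 'hist:name=lat:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger
  # echo 'hist:name=lat:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_wakeup/trigger

Both triggers attach to the named histogram 'lat' and set the same variable ts0. If the second trigger instead defined a different variable set (say ts1), the updated match test would reject it as incompatible with the existing named trigger.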
Link: http://lkml.kernel.org/r/a17eae6328a99917f9d5c66129c9fcd355279ee9.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8f43f24bf49c..ba326260c034 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1610,7 +1610,7 @@ static int event_hist_trigger_print(struct seq_file *m, sort_key = &hist_data->sort_keys[i]; idx = sort_key->field_idx; - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + if (WARN_ON(idx >= HIST_FIELDS_MAX)) return -EINVAL; if (i > 0) @@ -1798,6 +1798,11 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; if (key_field->is_signed != key_field_test->is_signed) return false; + if (!!key_field->var.name != !!key_field_test->var.name) + return false; + if (key_field->var.name && + strcmp(key_field->var.name, key_field_test->var.name) != 0) + return false; } for (i = 0; i < hist_data->n_sort_keys; i++) { -- cgit v1.2.3-71-gd317 From 2ece94fbd25c70543dd073d10569e08c3e3b4a7f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:51 -0600 Subject: tracing: Move get_hist_field_flags() Move get_hist_field_flags() to make it more easily accessible for new code (and keep the move separate from new functionality). Link: http://lkml.kernel.org/r/32470f0a7047ec7a6e84ba5ec89d6142cc6ede7d.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ba326260c034..a81a709dc703 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -497,6 +497,28 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = { .elt_init = hist_trigger_elt_comm_init, }; +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; + + return flags_str; +} + static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { @@ -1495,28 +1517,6 @@ const struct file_operations event_hist_fops = { .release = single_release, }; -static const char *get_hist_field_flags(struct hist_field *hist_field) -{ - const char *flags_str = NULL; - - if (hist_field->flags & HIST_FIELD_FL_HEX) - flags_str = "hex"; - else if (hist_field->flags & HIST_FIELD_FL_SYM) - flags_str = "sym"; - else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) - flags_str = "sym-offset"; - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) - flags_str = "execname"; - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) - flags_str = "syscall"; - else if (hist_field->flags & HIST_FIELD_FL_LOG2) - 
flags_str = "log2"; - else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) - flags_str = "usecs"; - - return flags_str; -} - static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); -- cgit v1.2.3-71-gd317 From 100719dcef447aa0c90301f919e81ae477b32bf2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:52 -0600 Subject: tracing: Add simple expression support to hist triggers Add support for simple addition, subtraction, and unary expressions (-(expr) and expr, where expr = b-a, a+b, a+b+c) to hist triggers, in order to support a minimal set of useful inter-event calculations. These operations are needed for calculating latencies between events (timestamp1-timestamp0) and for combined latencies (latencies over 3 or more events). In the process, factor out some common code from key and value parsing. Link: http://lkml.kernel.org/r/9a9308ead4fe32a433d9c7e95921fb798394f6b2.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi [kbuild test robot fix, add static to parse_atom()] Signed-off-by: Fengguang Wu [ Replaced '//' comments with normal /* */ comments ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 487 +++++++++++++++++++++++++++++++++------ 1 file changed, 413 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a81a709dc703..4c3c7d784bfd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -32,6 +32,13 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) +enum field_op_id { + FIELD_OP_NONE, + FIELD_OP_PLUS, + FIELD_OP_MINUS, + FIELD_OP_UNARY_MINUS, +}; + struct hist_var { char *name; struct hist_trigger_data *hist_data; @@ -48,6 +55,8 @@ struct hist_field { struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; struct hist_var var; + enum field_op_id operator; + char *name; }; static u64 hist_field_none(struct hist_field *field, void *event, @@ -98,6 +107,41 @@ static u64 hist_field_log2(struct hist_field *hist_field, void *event, return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_plus(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, event, rbe); + u64 val2 = operand2->fn(operand2, event, rbe); + + return val1 + val2; +} + +static u64 hist_field_minus(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, event, rbe); + u64 val2 = operand2->fn(operand2, event, rbe); + + return val1 - val2; +} + +static u64 hist_field_unary_minus(struct hist_field *hist_field, void *event, + struct ring_buffer_event *rbe) +{ + struct hist_field *operand = hist_field->operands[0]; + + s64 sval = (s64)operand->fn(operand, event, rbe); + u64 val = (u64)-sval; + + return val; +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ void *event, \ @@ -147,6 +191,7 @@ enum hist_field_flags { HIST_FIELD_FL_TIMESTAMP = 1 << 10, HIST_FIELD_FL_TIMESTAMP_USECS = 1 
<< 11, HIST_FIELD_FL_VAR = 1 << 12, + HIST_FIELD_FL_EXPR = 1 << 13, }; struct var_defs { @@ -258,6 +303,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_EXPR) + field_name = field->name; if (field_name == NULL) field_name = ""; @@ -519,12 +566,104 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) return flags_str; } +static void expr_field_str(struct hist_field *field, char *expr) +{ + strcat(expr, hist_field_name(field, 0)); + + if (field->flags) { + const char *flags_str = get_hist_field_flags(field); + + if (flags_str) { + strcat(expr, "."); + strcat(expr, flags_str); + } + } +} + +static char *expr_str(struct hist_field *field, unsigned int level) +{ + char *expr; + + if (level > 1) + return NULL; + + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!expr) + return NULL; + + if (!field->operands[0]) { + expr_field_str(field, expr); + return expr; + } + + if (field->operator == FIELD_OP_UNARY_MINUS) { + char *subexpr; + + strcat(expr, "-("); + subexpr = expr_str(field->operands[0], ++level); + if (!subexpr) { + kfree(expr); + return NULL; + } + strcat(expr, subexpr); + strcat(expr, ")"); + + kfree(subexpr); + + return expr; + } + + expr_field_str(field->operands[0], expr); + + switch (field->operator) { + case FIELD_OP_MINUS: + strcat(expr, "-"); + break; + case FIELD_OP_PLUS: + strcat(expr, "+"); + break; + default: + kfree(expr); + return NULL; + } + + expr_field_str(field->operands[1], expr); + + return expr; +} + +static int contains_operator(char *str) +{ + enum field_op_id field_op = FIELD_OP_NONE; + char *op; + + op = strpbrk(str, "+-"); + if (!op) + return FIELD_OP_NONE; + + switch (*op) { + case '-': + if (*str == '-') + field_op = FIELD_OP_UNARY_MINUS; + else + field_op = FIELD_OP_MINUS; + break; + case '+': + field_op = FIELD_OP_PLUS; + break; + default: + break; + } + + return field_op; +} + static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { unsigned int i; - if (level > 2) + if (level > 3) return; if (!hist_field) @@ -534,6 +673,7 @@ static void destroy_hist_field(struct hist_field *hist_field, destroy_hist_field(hist_field->operands[i], level + 1); kfree(hist_field->var.name); + kfree(hist_field->name); kfree(hist_field); } @@ -554,6 +694,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->hist_data = hist_data; + if (flags & HIST_FIELD_FL_EXPR) + goto out; /* caller will populate */ + if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; goto out; @@ -626,6 +769,257 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) } } +static struct ftrace_event_field * +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + char *field_str, unsigned long *flags) +{ + struct ftrace_event_field *field = NULL; + char *field_name, *modifier, *str; + + modifier = str = kstrdup(field_str, GFP_KERNEL); + if (!modifier) + return ERR_PTR(-ENOMEM); + + field_name = strsep(&modifier, "."); + if (modifier) { + if (strcmp(modifier, "hex") == 0) + *flags |= HIST_FIELD_FL_HEX; + else if (strcmp(modifier, "sym") == 0) + *flags |= HIST_FIELD_FL_SYM; + else if (strcmp(modifier, "sym-offset") == 0) + *flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(modifier, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + *flags 
|= HIST_FIELD_FL_EXECNAME; + else if (strcmp(modifier, "syscall") == 0) + *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "log2") == 0) + *flags |= HIST_FIELD_FL_LOG2; + else if (strcmp(modifier, "usecs") == 0) + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; + else { + field = ERR_PTR(-EINVAL); + goto out; + } + } + + if (strcmp(field_name, "common_timestamp") == 0) { + *flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; + } else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + field = ERR_PTR(-EINVAL); + goto out; + } + } + out: + kfree(str); + + return field; +} + +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, + struct trace_event_file *file, char *str, + unsigned long *flags, char *var_name) +{ + struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + int ret = 0; + + field = parse_field(hist_data, file, str, flags); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto out; + } + + hist_field = create_hist_field(hist_data, field, *flags, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + + return hist_field; + out: + return ERR_PTR(ret); +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level); + +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) +{ + struct hist_field *operand1, *expr = NULL; + unsigned long operand_flags; + int ret = 0; + char *s; + + /* we support only -(xxx) i.e. 
explicit parens required */ + + if (level > 3) { + ret = -EINVAL; + goto free; + } + + str++; /* skip leading '-' */ + + s = strchr(str, '('); + if (s) + str++; + else { + ret = -EINVAL; + goto free; + } + + s = strrchr(str, ')'); + if (s) + *s = '\0'; + else { + ret = -EINVAL; /* no closing ')' */ + goto free; + } + + flags |= HIST_FIELD_FL_EXPR; + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand_flags = 0; + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + goto free; + } + + expr->flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + expr->fn = hist_field_unary_minus; + expr->operands[0] = operand1; + expr->operator = FIELD_OP_UNARY_MINUS; + expr->name = expr_str(expr, 0); + + return expr; + free: + destroy_hist_field(expr, 0); + return ERR_PTR(ret); +} + +static int check_expr_operands(struct hist_field *operand1, + struct hist_field *operand2) +{ + unsigned long operand1_flags = operand1->flags; + unsigned long operand2_flags = operand2->flags; + + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) + return -EINVAL; + + return 0; +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int level) +{ + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; + unsigned long operand_flags; + int field_op, ret = -EINVAL; + char *sep, *operand1_str; + + if (level > 3) + return ERR_PTR(-EINVAL); + + field_op = contains_operator(str); + + if (field_op == FIELD_OP_NONE) + return parse_atom(hist_data, file, str, &flags, var_name); + + if (field_op == FIELD_OP_UNARY_MINUS) + return parse_unary(hist_data, file, str, flags, var_name, ++level); + + switch (field_op) { + case FIELD_OP_MINUS: + sep = "-"; + break; + case FIELD_OP_PLUS: + sep = "+"; + break; + default: + goto free; + } + + operand1_str = strsep(&str, sep); + if (!operand1_str || !str) + goto free; + + operand_flags = 0; + operand1 = parse_atom(hist_data, file, operand1_str, + &operand_flags, NULL); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + operand1 = NULL; + goto free; + } + + /* rest of string could be another expression e.g. 
b+c in a+b+c */ + operand_flags = 0; + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + if (IS_ERR(operand2)) { + ret = PTR_ERR(operand2); + operand2 = NULL; + goto free; + } + + ret = check_expr_operands(operand1, operand2); + if (ret) + goto free; + + flags |= HIST_FIELD_FL_EXPR; + + flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + expr->operands[0] = operand1; + expr->operands[1] = operand2; + expr->operator = field_op; + expr->name = expr_str(expr, 0); + + switch (field_op) { + case FIELD_OP_MINUS: + expr->fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + expr->fn = hist_field_plus; + break; + default: + goto free; + } + + return expr; + free: + destroy_hist_field(operand1, 0); + destroy_hist_field(operand2, 0); + destroy_hist_field(expr, 0); + + return ERR_PTR(ret); +} + static int create_hitcount_val(struct hist_trigger_data *hist_data) { hist_data->fields[HITCOUNT_IDX] = @@ -648,37 +1042,17 @@ static int __create_val_field(struct hist_trigger_data *hist_data, char *var_name, char *field_str, unsigned long flags) { - struct ftrace_event_field *field = NULL; - char *field_name; + struct hist_field *hist_field; int ret = 0; - field_name = strsep(&field_str, "."); - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else { - ret = -EINVAL; - goto out; - } - } - - if (strcmp(field_name, "common_timestamp") == 0) { - flags |= HIST_FIELD_FL_TIMESTAMP; - hist_data->enable_timestamps = true; - } else { - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { - ret = -EINVAL; - goto out; - } - } - - hist_data->fields[val_idx] = create_hist_field(hist_data, field, flags, var_name); - if (!hist_data->fields[val_idx]) { - ret = -ENOMEM; + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); goto out; } + hist_data->fields[val_idx] = hist_field; + ++hist_data->n_vals; ++hist_data->n_fields; @@ -765,8 +1139,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { - struct ftrace_event_field *field = NULL; struct hist_field *hist_field = NULL; + unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -781,60 +1155,24 @@ static int create_key_field(struct hist_trigger_data *hist_data, key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { - char *field_name = strsep(&field_str, "."); - - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else if (strcmp(field_str, "sym") == 0) - flags |= HIST_FIELD_FL_SYM; - else if (strcmp(field_str, "sym-offset") == 0) - flags |= HIST_FIELD_FL_SYM_OFFSET; - else if ((strcmp(field_str, "execname") == 0) && - (strcmp(field_name, "common_pid") == 0)) - flags |= HIST_FIELD_FL_EXECNAME; - else if (strcmp(field_str, "syscall") == 0) - flags |= HIST_FIELD_FL_SYSCALL; - else if (strcmp(field_str, "log2") == 0) - flags |= HIST_FIELD_FL_LOG2; - else if (strcmp(field_str, "usecs") == 0) - flags |= HIST_FIELD_FL_TIMESTAMP_USECS; - else { - ret = -EINVAL; - goto out; - } + hist_field = parse_expr(hist_data, file, field_str, flags, + NULL, 0); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); + goto out; } - if (strcmp(field_name, "common_timestamp") == 0) { - 
flags |= HIST_FIELD_FL_TIMESTAMP; - hist_data->enable_timestamps = true; - if (flags & HIST_FIELD_FL_TIMESTAMP_USECS) - hist_data->attrs->ts_in_usecs = true; - key_size = sizeof(u64); - } else { - field = trace_find_event_field(file->event_call, field_name); - if (!field || !field->size) { - ret = -EINVAL; - goto out; - } - - if (is_string_field(field)) - key_size = MAX_FILTER_STR_VAL; - else - key_size = field->size; - } + key_size = hist_field->size; } - hist_data->fields[key_idx] = create_hist_field(hist_data, field, flags, NULL); - if (!hist_data->fields[key_idx]) { - ret = -ENOMEM; - goto out; - } + hist_data->fields[key_idx] = hist_field; key_size = ALIGN(key_size, sizeof(u64)); hist_data->fields[key_idx]->size = key_size; hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { ret = -EINVAL; goto out; @@ -1419,7 +1757,8 @@ hist_trigger_entry_print(struct seq_file *m, for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR) + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) continue; if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { -- cgit v1.2.3-71-gd317 From af6a29bcaf8ff260222a953536c13c167d5c4649 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:53 -0600 Subject: tracing: Generalize per-element hist trigger data Up until now, hist triggers only needed per-element support for saving 'comm' data, which was saved directly as a private data pointer. In anticipation of the need to save other data besides 'comm', add a new hist_elt_data struct for the purpose, and switch the current 'comm'-related code over to that. 
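In outline (a condensed sketch of the code added below, not a verbatim excerpt), per-element data now lives behind a dedicated struct instead of a bare string pointer:

  struct hist_elt_data {
          char *comm;     /* sole member for now; later patches add more */
  };

  /* consumers dereference the struct rather than casting private_data */
  struct hist_elt_data *elt_data = elt->private_data;

  if (elt_data->comm)
          save_comm(elt_data->comm, current);

This keeps elt->private_data opaque while giving future patches a place to hang additional per-element state.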
Link: http://lkml.kernel.org/r/4502c338c965ddf5fc19fb1ec4764391e001ed4b.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 76 +++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4c3c7d784bfd..f072ed3122c8 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -289,6 +289,10 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, return NULL; } +struct hist_elt_data { + char *comm; +}; + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -503,45 +507,61 @@ static inline void save_comm(char *comm, struct task_struct *task) memcpy(comm, task->comm, TASK_COMM_LEN); } -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +static void hist_elt_data_free(struct hist_elt_data *elt_data) +{ + kfree(elt_data->comm); + kfree(elt_data); +} + +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt) { - kfree((char *)elt->private_data); + struct hist_elt_data *elt_data = elt->private_data; + + hist_elt_data_free(elt_data); } -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) { struct hist_trigger_data *hist_data = elt->map->private_data; + unsigned int size = TASK_COMM_LEN; + struct hist_elt_data *elt_data; struct hist_field *key_field; unsigned int i; + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) + return -ENOMEM; + for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - unsigned int size = TASK_COMM_LEN + 1; - - elt->private_data = kzalloc(size, GFP_KERNEL); - if (!elt->private_data) + elt_data->comm = kzalloc(size, GFP_KERNEL); + if (!elt_data->comm) { + kfree(elt_data); return -ENOMEM; + } break; } } + elt->private_data = elt_data; + return 0; } -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt) { - char *comm = elt->private_data; + struct hist_elt_data *elt_data = elt->private_data; - if (comm) - save_comm(comm, current); + if (elt_data->comm) + save_comm(elt_data->comm, current); } -static const struct tracing_map_ops hist_trigger_elt_comm_ops = { - .elt_alloc = hist_trigger_elt_comm_alloc, - .elt_free = hist_trigger_elt_comm_free, - .elt_init = hist_trigger_elt_comm_init, +static const struct tracing_map_ops hist_trigger_elt_data_ops = { + .elt_alloc = hist_trigger_elt_data_alloc, + .elt_free = hist_trigger_elt_data_free, + .elt_init = hist_trigger_elt_data_init, }; static const char *get_hist_field_flags(struct hist_field *hist_field) @@ -1484,21 +1504,6 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) return 0; } -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) -{ - struct hist_field *key_field; - unsigned int i; - - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; - - if (key_field->flags & HIST_FIELD_FL_EXECNAME) - return true; - } - - return false; -} - static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, @@ -1524,8 +1529,7 @@ create_hist_data(unsigned int map_bits, if (ret) goto free; - if (need_tracing_map_ops(hist_data)) - map_ops = 
&hist_trigger_elt_comm_ops; + map_ops = &hist_trigger_elt_data_ops; hist_data->map = tracing_map_create(map_bits, hist_data->key_size, map_ops, hist_data); @@ -1713,7 +1717,13 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, "%s: [%llx] %-55s", field_name, uval, str); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { - char *comm = elt->private_data; + struct hist_elt_data *elt_data = elt->private_data; + char *comm; + + if (WARN_ON_ONCE(!elt_data)) + return; + + comm = elt_data->comm; uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %-16s[%10llu]", field_name, -- cgit v1.2.3-71-gd317 From df35d93bbff0297617edf105e6b4057a3953a1a9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:54 -0600 Subject: tracing: Pass tracing_map_elt to hist_field accessor functions Some accessor functions, such as for variable references, require access to a corresponding tracing_map_elt. Add a tracing_map_elt param to the function signature and update the accessor functions accordingly. Link: http://lkml.kernel.org/r/e0f292b068e9e4948da1d5af21b5ae0efa9b5717.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 91 +++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f072ed3122c8..7a54ab50176b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -26,8 +26,10 @@ struct hist_field; -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event, - struct ring_buffer_event *rbe); +typedef u64 (*hist_field_fn_t) (struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event); #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) @@ -59,28 +61,36 @@ struct hist_field { char *name; }; -static u64 hist_field_none(struct hist_field *field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_none(struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { return 0; } -static u64 hist_field_counter(struct hist_field *field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_counter(struct hist_field *field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { return 1; } -static u64 hist_field_string(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_string(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { char *addr = (char *)(event + hist_field->field->offset); return (u64)(unsigned long)addr; } -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_dynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { u32 str_item = *(u32 *)(event + hist_field->field->offset); int str_loc = str_item & 0xffff; @@ -89,54 +99,64 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event, return (u64)(unsigned long)addr; } -static u64 hist_field_pstring(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_pstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct
ring_buffer_event *rbe, + void *event) { char **addr = (char **)(event + hist_field->field->offset); return (u64)(unsigned long)*addr; } -static u64 hist_field_log2(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_log2(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, event, rbe); + u64 val = operand->fn(operand, elt, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } -static u64 hist_field_plus(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_plus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, event, rbe); - u64 val2 = operand2->fn(operand2, event, rbe); + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); return val1 + val2; } -static u64 hist_field_minus(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, event, rbe); - u64 val2 = operand2->fn(operand2, event, rbe); + u64 val1 = operand1->fn(operand1, elt, rbe, event); + u64 val2 = operand2->fn(operand2, elt, rbe, event); return val1 - val2; } -static u64 hist_field_unary_minus(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_unary_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, event, rbe); + s64 sval = (s64)operand->fn(operand, elt, rbe, event); u64 val = (u64)-sval; return val; @@ -144,8 +164,9 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, void *event, #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ - void *event, \ - struct ring_buffer_event *rbe) \ + struct tracing_map_elt *elt, \ + struct ring_buffer_event *rbe, \ + void *event) \ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ @@ -233,8 +254,10 @@ struct hist_trigger_data { bool remove; }; -static u64 hist_field_timestamp(struct hist_field *hist_field, void *event, - struct ring_buffer_event *rbe) +static u64 hist_field_timestamp(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) { struct hist_trigger_data *hist_data = hist_field->hist_data; struct trace_array *tr = hist_data->event_file->tr; @@ -1570,7 +1593,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, rec, rbe); + hist_val = hist_field->fn(hist_field, elt, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); @@ -1582,7 +1605,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, 
for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, rec, rbe); + hist_val = hist_field->fn(hist_field, elt, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } @@ -1620,9 +1643,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; char compound_key[HIST_KEY_SIZE_MAX]; + struct tracing_map_elt *elt = NULL; struct stack_trace stacktrace; struct hist_field *key_field; - struct tracing_map_elt *elt; u64 field_contents; void *key = NULL; unsigned int i; @@ -1643,7 +1666,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key = entries; } else { - field_contents = key_field->fn(key_field, rec, rbe); + field_contents = key_field->fn(key_field, elt, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; -- cgit v1.2.3-71-gd317 From 19a9facd0fe33a3e376923383958b2c86cbd3994 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:55 -0600 Subject: tracing: Add hist_field 'type' field Future support for synthetic events requires hist_field 'type' information, so add a field for that. Also, make other hist_field attribute usage consistent (size, is_signed, etc). Link: http://lkml.kernel.org/r/3fd12a2e86316b05151ba0d7c68268e780af2c9d.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7a54ab50176b..e30bd86bee8e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -54,6 +54,7 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; struct hist_var var; @@ -717,6 +718,7 @@ static void destroy_hist_field(struct hist_field *hist_field, kfree(hist_field->var.name); kfree(hist_field->name); + kfree(hist_field->type); kfree(hist_field); } @@ -742,6 +744,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -755,12 +761,18 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn = hist_field_log2; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; + hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } if (flags & HIST_FIELD_FL_TIMESTAMP) { hist_field->fn = hist_field_timestamp; hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -770,6 +782,11 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if 
(!hist_field->type) + goto free; + if (field->filter_type == FILTER_STATIC_STRING) hist_field->fn = hist_field_string; else if (field->filter_type == FILTER_DYN_STRING) @@ -777,6 +794,12 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, else hist_field->fn = hist_field_pstring; } else { + hist_field->size = field->size; + hist_field->is_signed = field->is_signed; + hist_field->type = kstrdup(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + hist_field->fn = select_value_fn(field->size, field->is_signed); if (!hist_field->fn) { @@ -949,6 +972,11 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } return expr; free: @@ -1042,6 +1070,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operands[1] = operand2; expr->operator = field_op; expr->name = expr_str(expr, 0); + expr->type = kstrdup(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } switch (field_op) { case FIELD_OP_MINUS: -- cgit v1.2.3-71-gd317 From 067fe038e70f6e64960d26a79c4df5f1413d0f13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:56 -0600 Subject: tracing: Add variable reference handling to hist triggers Add the necessary infrastructure to allow the variables defined on one event to be referenced in another. This allows variables set by a previous event to be referenced and used in expressions combining the variable values saved by that previous event and the event fields of the current event. For example, here's how a latency can be calculated and saved into yet another variable named 'wakeup_lat': # echo 'hist:keys=pid,prio:ts0=common_timestamp ... # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... In the first event, the event's timestamp is saved into the variable ts0. In the next line, ts0 is subtracted from the second event's timestamp to produce the latency. Further users of variable references will be described in subsequent patches, such as how the 'wakeup_lat' variable above can be displayed in a latency histogram.
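As an illustrative sketch only (the full trigger strings, the sched_waking/sched_switch event names and the tracefs paths below are assumptions consistent with the usage shown later in this series, not something this patch mandates), the two truncated commands above might be written out in full as:

 # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
 # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger

When a sched_switch event is processed, the $ts0 reference is resolved by looking up, in the sched_waking histogram, the variable value saved under the matching key (here, the woken task's pid).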
Link: http://lkml.kernel.org/r/b1d3e6975374e34d501ff417c20189c3f9b2c7b8.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 + kernel/trace/trace.h | 3 + kernel/trace/trace_events_hist.c | 661 +++++++++++++++++++++++++++++++++++- kernel/trace/trace_events_trigger.c | 6 + 4 files changed, 656 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82cc8891fda6..68f8702af9fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7783,6 +7783,7 @@ static int instance_mkdir(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); + INIT_LIST_HEAD(&tr->hist_vars); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -8533,6 +8534,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); + INIT_LIST_HEAD(&global_trace.hist_vars); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 89771b4f16df..99b7ee7ed127 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -274,6 +274,7 @@ struct trace_array { int function_enabled; #endif int time_stamp_abs_ref; + struct list_head hist_vars; }; enum { @@ -1548,6 +1549,8 @@ extern void pause_named_trigger(struct event_trigger_data *data); extern void unpause_named_trigger(struct event_trigger_data *data); extern void set_named_trigger_data(struct event_trigger_data *data, struct event_trigger_data *named_data); +extern struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e30bd86bee8e..dbcdd2ff76a4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -59,7 +59,12 @@ struct hist_field { struct hist_trigger_data *hist_data; struct hist_var var; enum field_op_id operator; + char *system; + char *event_name; char *name; + unsigned int var_idx; + unsigned int var_ref_idx; + bool read_once; }; static u64 hist_field_none(struct hist_field *field, @@ -214,6 +219,7 @@ enum hist_field_flags { HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, HIST_FIELD_FL_VAR = 1 << 12, HIST_FIELD_FL_EXPR = 1 << 13, + HIST_FIELD_FL_VAR_REF = 1 << 14, }; struct var_defs { @@ -253,6 +259,8 @@ struct hist_trigger_data { struct tracing_map *map; bool enable_timestamps; bool remove; + struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; + unsigned int n_var_refs; }; static u64 hist_field_timestamp(struct hist_field *hist_field, @@ -271,6 +279,214 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, return ts; } +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; +}; + +static struct hist_field * +check_field_for_var_ref(struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *found = NULL; + + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) { + found = hist_field; + } + } + + return found; +} + +static struct hist_field * +check_field_for_var_refs(struct hist_trigger_data *hist_data, + struct hist_field 
*hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx, + unsigned int level) +{ + struct hist_field *found = NULL; + unsigned int i; + + if (level > 3) + return found; + + if (!hist_field) + return found; + + found = check_field_for_var_ref(hist_field, var_data, var_idx); + if (found) + return found; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + found = check_field_for_var_refs(hist_data, operand, var_data, + var_idx, level + 1); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *hist_field, *found = NULL; + unsigned int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + + return found; +} + +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, + unsigned int var_idx) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *found = NULL; + struct hist_var_data *var_data; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) + continue; + found = find_var_ref(var_data->hist_data, hist_data, var_idx); + if (found) + break; + } + + return found; +} + +static bool check_var_refs(struct hist_trigger_data *hist_data) +{ + struct hist_field *field; + bool found = false; + int i; + + for_each_hist_field(i, hist_data) { + field = hist_data->fields[i]; + if (field && field->flags & HIST_FIELD_FL_VAR) { + if (find_any_var_ref(hist_data, field->var.idx)) { + found = true; + break; + } + } + } + + return found; +} + +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data, *found = NULL; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) { + found = var_data; + break; + } + } + + return found; +} + +static bool field_has_hist_vars(struct hist_field *hist_field, + unsigned int level) +{ + int i; + + if (level > 3) + return false; + + if (!hist_field) + return false; + + if (hist_field->flags & HIST_FIELD_FL_VAR || + hist_field->flags & HIST_FIELD_FL_VAR_REF) + return true; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + if (field_has_hist_vars(operand, level + 1)) + return true; + } + + return false; +} + +static bool has_hist_vars(struct hist_trigger_data *hist_data) +{ + struct hist_field *hist_field; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (field_has_hist_vars(hist_field, 0)) + return true; + } + + return false; +} + +static int save_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (var_data) + return 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); + if (!var_data) { + trace_array_put(tr); + return -ENOMEM; + } + + var_data->hist_data = hist_data; + list_add(&var_data->list, &tr->hist_vars); + + return 0; +} + +static void remove_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = 
hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (!var_data) + return; + + if (WARN_ON(check_var_refs(hist_data))) + return; + + list_del(&var_data->list); + + kfree(var_data); + + trace_array_put(tr); +} + static struct hist_field *find_var_field(struct hist_trigger_data *hist_data, const char *var_name) { @@ -313,10 +529,137 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, return NULL; } +static struct trace_event_file *find_var_file(struct trace_array *tr, + char *system, + char *event_name, + char *var_name) +{ + struct hist_trigger_data *var_hist_data; + struct hist_var_data *var_data; + struct trace_event_file *file, *found = NULL; + + if (system) + return find_event_file(tr, system, event_name); + + list_for_each_entry(var_data, &tr->hist_vars, list) { + var_hist_data = var_data->hist_data; + file = var_hist_data->event_file; + if (file == found) + continue; + + if (find_var_field(var_hist_data, var_name)) { + if (found) + return NULL; + + found = file; + } + } + + return found; +} + +static struct hist_field *find_file_var(struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, + char *system, + char *event_name, + char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field = NULL; + struct trace_event_file *file; + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + return NULL; + + hist_field = find_file_var(file, var_name); + + return hist_field; +} + struct hist_elt_data { char *comm; + u64 *var_ref_vals; }; +static u64 hist_field_var_ref(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + u64 var_val = 0; + + elt_data = elt->private_data; + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; + + return var_val; +} + +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key, + u64 *var_ref_vals, bool self) +{ + struct hist_trigger_data *var_data; + struct tracing_map_elt *var_elt; + struct hist_field *hist_field; + unsigned int i, var_idx; + bool resolved = true; + u64 var_val = 0; + + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + var_idx = hist_field->var.idx; + var_data = hist_field->var.hist_data; + + if (var_data == NULL) { + resolved = false; + break; + } + + if ((self && var_data != hist_data) || + (!self && var_data == hist_data)) + continue; + + var_elt = tracing_map_lookup(var_data->map, key); + if (!var_elt) { + resolved = false; + break; + } + + if (!tracing_map_var_set(var_elt, var_idx)) { + resolved = false; + break; + } + + if (self || !hist_field->read_once) + var_val = tracing_map_read_var(var_elt, var_idx); + else + var_val = tracing_map_read_var_once(var_elt, var_idx); + + var_ref_vals[i] = var_val; + } + + return resolved; +} + static const char *hist_field_name(struct hist_field *field, unsigned int level) { @@ -331,8 +674,20 @@ static const char *hist_field_name(struct hist_field *field, field_name = 
hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; - else if (field->flags & HIST_FIELD_FL_EXPR) - field_name = field->name; + else if (field->flags & HIST_FIELD_FL_EXPR || + field->flags & HIST_FIELD_FL_VAR_REF) { + if (field->system) { + static char full_name[MAX_FILTER_STR_VAL]; + + strcat(full_name, field->system); + strcat(full_name, "."); + strcat(full_name, field->event_name); + strcat(full_name, "."); + strcat(full_name, field->name); + field_name = full_name; + } else + field_name = field->name; + } if (field_name == NULL) field_name = ""; @@ -612,6 +967,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void expr_field_str(struct hist_field *field, char *expr) { + if (field->flags & HIST_FIELD_FL_VAR_REF) + strcat(expr, "$"); + strcat(expr, hist_field_name(field, 0)); if (field->flags) { @@ -742,6 +1100,11 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_EXPR) goto out; /* caller will populate */ + if (flags & HIST_FIELD_FL_VAR_REF) { + hist_field->fn = hist_field_var_ref; + goto out; + } + if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; hist_field->size = sizeof(u64); @@ -835,6 +1198,144 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) } } +static int init_var_ref(struct hist_field *ref_field, + struct hist_field *var_field, + char *system, char *event_name) +{ + int err = 0; + + ref_field->var.idx = var_field->var.idx; + ref_field->var.hist_data = var_field->hist_data; + ref_field->size = var_field->size; + ref_field->is_signed = var_field->is_signed; + ref_field->flags |= var_field->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + if (system) { + ref_field->system = kstrdup(system, GFP_KERNEL); + if (!ref_field->system) + return -ENOMEM; + } + + if (event_name) { + ref_field->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ref_field->event_name) { + err = -ENOMEM; + goto free; + } + } + + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + + ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + if (!ref_field->type) { + err = -ENOMEM; + goto free; + } + out: + return err; + free: + kfree(ref_field->system); + kfree(ref_field->event_name); + kfree(ref_field->name); + + goto out; +} + +static struct hist_field *create_var_ref(struct hist_field *var_field, + char *system, char *event_name) +{ + unsigned long flags = HIST_FIELD_FL_VAR_REF; + struct hist_field *ref_field; + + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); + if (ref_field) { + if (init_var_ref(ref_field, var_field, system, event_name)) { + destroy_hist_field(ref_field, 0); + return NULL; + } + } + + return ref_field; +} + +static bool is_var_ref(char *var_name) +{ + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$') + return false; + + return true; +} + +static char *field_name_from_var(struct hist_trigger_data *hist_data, + char *var_name) +{ + char *name, *field; + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + name = hist_data->attrs->var_defs.name[i]; + + if (strcmp(var_name, name) == 0) { + field = hist_data->attrs->var_defs.expr[i]; + if (contains_operator(field) || is_var_ref(field)) + continue; + return field; + } + } + + return NULL; +} + +static char *local_field_var_ref(struct hist_trigger_data *hist_data, + char *system, 
char *event_name, + char *var_name) +{ + struct trace_event_call *call; + + if (system && event_name) { + call = hist_data->event_file->event_call; + + if (strcmp(system, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + if (!!system != !!event_name) + return NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + return field_name_from_var(hist_data, var_name); +} + +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct hist_field *var_field = NULL, *ref_field = NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + var_field = find_event_var(hist_data, system, event_name, var_name); + if (var_field) + ref_field = create_var_ref(var_field, system, event_name); + + return ref_field; +} + static struct ftrace_event_field * parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str, unsigned long *flags) @@ -891,10 +1392,40 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) { + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; struct ftrace_event_field *field = NULL; struct hist_field *hist_field = NULL; int ret = 0; + s = strchr(str, '.'); + if (s) { + s = strchr(++s, '.'); + if (s) { + ref_system = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_event = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_var = str; + } + } + + s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); + if (!s) { + hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + if (hist_field) { + hist_data->var_refs[hist_data->n_var_refs] = hist_field; + hist_field->var_ref_idx = hist_data->n_var_refs++; + return hist_field; + } + } else + str = s; + field = parse_field(hist_data, file, str, flags); if (IS_ERR(field)) { ret = PTR_ERR(field); @@ -1066,6 +1597,9 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } + operand1->read_once = true; + operand2->read_once = true; + expr->operands[0] = operand1; expr->operands[1] = operand2; expr->operator = field_op; @@ -1238,6 +1772,12 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + destroy_hist_field(hist_field, 0); + ret = -EINVAL; + goto out; + } + key_size = hist_field->size; } @@ -1576,6 +2116,7 @@ create_hist_data(unsigned int map_bits, hist_data->attrs = attrs; hist_data->remove = remove; + hist_data->event_file = file; ret = create_hist_fields(hist_data, file); if (ret) @@ -1598,12 +2139,6 @@ create_hist_data(unsigned int map_bits, ret = create_tracing_map_fields(hist_data); if (ret) goto free; - - ret = tracing_map_init(hist_data->map); - if (ret) - goto free; - - hist_data->event_file = file; out: return hist_data; free: @@ -1618,12 +2153,17 @@ create_hist_data(unsigned int map_bits, static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe) + struct ring_buffer_event *rbe, + u64 *var_ref_vals) { + struct hist_elt_data *elt_data; struct hist_field *hist_field; unsigned int i, var_idx; u64 hist_val; + elt_data = elt->private_data; + elt_data->var_ref_vals = var_ref_vals; + for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; 
hist_val = hist_field->fn(hist_field, elt, rbe, rec); @@ -1675,6 +2215,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct stack_trace stacktrace; @@ -1714,9 +2255,15 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, if (use_compound_key) key = compound_key; + if (hist_data->n_var_refs && + !resolve_var_refs(hist_data, key, var_ref_vals, false)) + return; + elt = tracing_map_insert(hist_data->map, key); - if (elt) - hist_trigger_elt_update(hist_data, elt, rec, rbe); + if (!elt) + return; + + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -1931,8 +2478,11 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); - else if (field_name) + else if (field_name) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) + seq_putc(m, '$'); seq_printf(m, "%s", field_name); + } if (hist_field->flags) { const char *flags_str = get_hist_field_flags(hist_field); @@ -2072,7 +2622,11 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { if (data->name) del_named_trigger(data); + trigger_data_free(data); + + remove_hist_vars(hist_data); + destroy_hist_data(hist_data); } } @@ -2285,23 +2839,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - list_add_rcu(&data->list, &file->triggers); ret++; - update_cond_flag(file); - if (hist_data->enable_timestamps) tracing_set_time_stamp_abs(file->tr, true); + out: + return ret; +} + +static int hist_trigger_enable(struct event_trigger_data *data, + struct trace_event_file *file) +{ + int ret = 0; + + list_add_tail_rcu(&data->list, &file->triggers); + + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); update_cond_flag(file); ret--; } - out: + return ret; } +static bool hist_trigger_check_refs(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + break; + } + } + + return false; +} + static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -2334,11 +2920,30 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, } } +static bool hist_file_check_refs(struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + } + } + + return false; +} + static void hist_unreg_all(struct trace_event_file *file) { 
struct event_trigger_data *test, *n; struct hist_trigger_data *hist_data; + if (hist_file_check_refs(file)) + return; + list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; @@ -2414,6 +3019,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, } if (remove) { + if (hist_trigger_check_refs(trigger_data, file)) { + ret = -EBUSY; + goto out_free; + } + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); ret = 0; goto out_free; @@ -2431,14 +3041,33 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } else if (ret < 0) goto out_free; + + if (get_named_trigger_data(trigger_data)) + goto enable; + + if (has_hist_vars(hist_data)) + save_hist_vars(hist_data); + + ret = tracing_map_init(hist_data->map); + if (ret) + goto out_unreg; +enable: + ret = hist_trigger_enable(trigger_data, file); + if (ret) + goto out_unreg; + /* Just return zero, not the number of registered triggers */ ret = 0; out: return ret; + out_unreg: + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); + remove_hist_vars(hist_data); + kfree(trigger_data); destroy_hist_data(hist_data); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 632471692462..d251cabcf69a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -909,6 +909,12 @@ void set_named_trigger_data(struct event_trigger_data *data, data->named_data = named_data; } +struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data) +{ + return data->named_data; +} + static void traceon_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) -- cgit v1.2.3-71-gd317 From 0212e2aa30e112363aa559f30f6c24ae095f3e78 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:57 -0600 Subject: tracing: Add hist trigger action hook Add a hook for executing extra actions whenever a histogram entry is added or updated. The default 'action' when a hist entry is added to a histogram is to update the set of values associated with it. Some applications may want to perform additional actions at that point, such as generate another event, or compare and save a maximum. Add a simple framework for doing that; specific actions will be implemented on top of it in later patches. 
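To make the hook concrete, here is a minimal sketch of what an action implementation could look like (the function name is hypothetical; real actions only arrive in later patches):

static void example_action(struct hist_trigger_data *hist_data,
			   struct tracing_map_elt *elt, void *rec,
			   struct ring_buffer_event *rbe,
			   struct action_data *data, u64 *var_ref_vals)
{
	/*
	 * Act on the just-updated histogram entry, e.g. generate
	 * another event from var_ref_vals or save a maximum in one
	 * of the entry's variables.
	 */
}

Each registered action is an action_data whose ->fn points at a function with this action_fn_t signature; hist_trigger_actions() simply walks hist_data->actions[] and invokes each one after the histogram entry has been updated.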
Link: http://lkml.kernel.org/r/9482ba6a3eaf5ca6e60954314beacd0e25c05b24.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 106 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dbcdd2ff76a4..68b9d6d396a6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -33,6 +33,7 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) +#define HIST_ACTIONS_MAX 8 enum field_op_id { FIELD_OP_NONE, @@ -242,6 +243,9 @@ struct hist_trigger_attrs { char *assignment_str[TRACING_MAP_VARS_MAX]; unsigned int n_assignments; + char *action_str[HIST_ACTIONS_MAX]; + unsigned int n_actions; + struct var_defs var_defs; }; @@ -261,6 +265,21 @@ struct hist_trigger_data { bool remove; struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; unsigned int n_var_refs; + + struct action_data *actions[HIST_ACTIONS_MAX]; + unsigned int n_actions; +}; + +struct action_data; + +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals); + +struct action_data { + action_fn_t fn; + unsigned int var_ref_idx; }; static u64 hist_field_timestamp(struct hist_field *hist_field, @@ -764,6 +783,9 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) for (i = 0; i < attrs->n_assignments; i++) kfree(attrs->assignment_str[i]); + for (i = 0; i < attrs->n_actions; i++) + kfree(attrs->action_str[i]); + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); @@ -771,6 +793,16 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) kfree(attrs); } +static int parse_action(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = 0; + + if (attrs->n_actions >= HIST_ACTIONS_MAX) + return ret; + + return ret; +} + static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -854,8 +886,9 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) else if (strcmp(str, "clear") == 0) attrs->clear = true; else { - ret = -EINVAL; - goto free; + ret = parse_action(str, attrs); + if (ret) + goto free; } } @@ -2047,11 +2080,55 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) return ret; } +static void destroy_actions(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + kfree(data); + } +} + +static int parse_actions(struct hist_trigger_data *hist_data) +{ + unsigned int i; + int ret = 0; + char *str; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + str = hist_data->attrs->action_str[i]; + } + + return ret; +} + +static int create_actions(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct action_data *data; + unsigned int i; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + data = hist_data->actions[i]; + } + + return ret; +} + static void destroy_hist_data(struct hist_trigger_data *hist_data) { + if (!hist_data) + return; + destroy_hist_trigger_attrs(hist_data->attrs); destroy_hist_fields(hist_data); tracing_map_destroy(hist_data->map); + + 
destroy_actions(hist_data); + kfree(hist_data); } @@ -2118,6 +2195,10 @@ create_hist_data(unsigned int map_bits, hist_data->remove = remove; hist_data->event_file = file; + ret = parse_actions(hist_data); + if (ret) + goto free; + ret = create_hist_fields(hist_data, file); if (ret) goto free; @@ -2209,6 +2290,20 @@ static inline void add_to_key(char *compound_key, void *key, memcpy(compound_key + key_field->offset, key, size); } +static void +hist_trigger_actions(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, u64 *var_ref_vals) +{ + struct action_data *data; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + data = hist_data->actions[i]; + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + } +} + static void event_hist_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *rbe) { @@ -2264,6 +2359,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, return; hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + + if (resolve_var_refs(hist_data, key, var_ref_vals, true)) + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -3048,6 +3146,10 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (has_hist_vars(hist_data)) save_hist_vars(hist_data); + ret = create_actions(hist_data, file); + if (ret) + goto out_unreg; + ret = tracing_map_init(hist_data->map); if (ret) goto out_unreg; -- cgit v1.2.3-71-gd317 From 4b147936fa509650beaf638b331573c23ba4d609 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:58 -0600 Subject: tracing: Add support for 'synthetic' events Synthetic events are user-defined events generated from hist trigger variables saved from one or more other events. To define a synthetic event, the user writes a simple specification consisting of the name of the new event along with one or more variables and their type(s), to the tracing/synthetic_events file. For instance, the following creates a new event named 'wakeup_latency' with 3 fields: lat, pid, and prio: # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \ /sys/kernel/debug/tracing/synthetic_events Reading the tracing/synthetic_events file lists all the currently-defined synthetic events, in this case the event we defined above: # cat /sys/kernel/debug/tracing/synthetic_events wakeup_latency u64 lat; pid_t pid; int prio At this point, the synthetic event is ready to use, and a histogram can be defined using it: # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger The new event is created under the tracing/events/synthetic/ directory and looks and behaves just like any other event: # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency enable filter format hist id trigger Although a histogram can be defined for it, nothing will happen until an action tracing that event via the trace_synth() function occurs. The trace_synth() function is very similar to all the other trace_* invocations spread throughout the kernel, except in this case the trace_ function and its corresponding tracepoint isn't statically generated but defined by the user at run-time. How this can be automatically hooked up via a hist trigger 'action' is discussed in a subsequent patch. 
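A synthetic event can likewise be removed by writing the same specification prefixed with '!', which fails with -EBUSY while the event is still referenced. A sketch, reusing the event defined above:

 # echo '!wakeup_latency u64 lat; pid_t pid; int prio' >> /sys/kernel/debug/tracing/synthetic_events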
Link: http://lkml.kernel.org/r/c68df2284b7d172669daf9be29db62ad49bbc559.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi [fix noderef.cocci warnings, sizeof pointer for kcalloc of event->fields] Signed-off-by: Fengguang Wu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 895 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 893 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 68b9d6d396a6..80d16d33ad5e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -20,10 +20,16 @@ #include #include #include +#include #include "tracing_map.h" #include "trace.h" +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 16 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -270,6 +276,26 @@ struct hist_trigger_data { unsigned int n_actions; }; +struct synth_field { + char *type; + char *name; + size_t size; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct list_head list; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; +}; + struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -282,6 +308,790 @@ struct action_data { unsigned int var_ref_idx; }; +static LIST_HEAD(synth_event_list); +static DEFINE_MUTEX(synth_event_mutex); + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (strncmp(type, "u", 1) == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += strlen("char["); + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = 
sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + entry->fields[n_u64], + i == se->n_fields - 1 ? 
"" : " "); + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct synth_event *event; + unsigned int i, n_u64; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + return; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; + char *str_field = (char *)&entry->fields[n_u64]; + + strncpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? 
"" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(char *field_type, + char *field_name) +{ + struct synth_field *field; + int len, ret = 0; + char *array; + + if (field_type[0] == ';') + field_type++; + + len = strlen(field_name); + if (field_name[len - 1] == ';') + field_name[len - 1] = '\0'; + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_type) + 1; + array = strchr(field_name, '['); + if (array) + len += strlen(array); + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + *array = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + field->name = kstrdup(field_name, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int var_ref_idx); + +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int var_ref_idx) +{ + struct tracepoint *tp = event->tp; + + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; + + if (!(cpu_online(raw_smp_processor_id()))) + return; + + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } +} + +static struct synth_event *find_synth_event(const char *name) +{ + struct synth_event *event; + + list_for_each_entry(event, &synth_event_list, list) { + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + 
event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->define_fields = synth_event_define_fields; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(char *event_name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(event_name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static void add_or_delete_synth_event(struct synth_event *event, int delete) +{ + if (delete) + free_synth_event(event); + else { + mutex_lock(&synth_event_mutex); + if (!find_synth_event(event->name)) + list_add(&event->list, &synth_event_list); + else + free_synth_event(event); + mutex_unlock(&synth_event_mutex); + } +} + +static int create_synth_event(int argc, char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + bool delete_event = false; + int i, n_fields = 0, ret = 0; + char *name; + + mutex_lock(&synth_event_mutex); + + /* + * Argument syntax: + * - Add synthetic event: field[;field] ... + * - Remove synthetic event: ! field[;field] ... 
+ * where 'field' = type field_name + */ + if (argc < 1) { + ret = -EINVAL; + goto out; + } + + name = argv[0]; + if (name[0] == '!') { + delete_event = true; + name++; + } + + event = find_synth_event(name); + if (event) { + if (delete_event) { + if (event->ref) { + event = NULL; + ret = -EBUSY; + goto out; + } + list_del(&event->list); + goto out; + } + event = NULL; + ret = -EEXIST; + goto out; + } else if (delete_event) + goto out; + + if (argc < 2) { + ret = -EINVAL; + goto out; + } + + for (i = 1; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argv[i], argv[i + 1]); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields] = field; + i++; n_fields++; + } + + if (i < argc) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + out: + mutex_unlock(&synth_event_mutex); + + if (event) { + if (delete_event) { + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } else { + ret = register_synth_event(event); + add_or_delete_synth_event(event, ret); + } + } + + return ret; + err: + mutex_unlock(&synth_event_mutex); + + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + free_synth_event(event); + + return ret; +} + +static int release_all_synth_events(void) +{ + struct list_head release_events; + struct synth_event *event, *e; + int ret = 0; + + INIT_LIST_HEAD(&release_events); + + mutex_lock(&synth_event_mutex); + + list_for_each_entry(event, &synth_event_list, list) { + if (event->ref) { + mutex_unlock(&synth_event_mutex); + return -EBUSY; + } + } + + list_splice_init(&event->list, &release_events); + + mutex_unlock(&synth_event_mutex); + + list_for_each_entry_safe(event, e, &release_events, list) { + list_del(&event->list); + + ret = unregister_synth_event(event); + add_or_delete_synth_event(event, !ret); + } + + return ret; +} + + +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&synth_event_mutex); + + return seq_list_start(&synth_event_list, *pos); +} + +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &synth_event_list, pos); +} + +static void synth_events_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&synth_event_mutex); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct synth_field *field; + struct synth_event *event = v; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? 
"" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations synth_events_seq_op = { + .start = synth_events_seq_start, + .next = synth_events_seq_next, + .stop = synth_events_seq_stop, + .show = synth_events_seq_show +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_synth_events(); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -2963,6 +3773,28 @@ static int hist_trigger_enable(struct event_trigger_data *data, return ret; } +static bool have_hist_trigger_match(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool match = false; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (hist_trigger_match(data, test, named_data, false)) { + match = true; + break; + } + } + } + + return match; +} + static bool hist_trigger_check_refs(struct event_trigger_data *data, struct trace_event_file *file) { @@ -3038,6 +3870,8 @@ static void hist_unreg_all(struct trace_event_file *file) { struct event_trigger_data *test, *n; struct hist_trigger_data *hist_data; + struct synth_event *se; + const char *se_name; if (hist_file_check_refs(file)) return; @@ -3047,6 +3881,14 @@ static void hist_unreg_all(struct trace_event_file *file) hist_data = test->private_data; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); + + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + update_cond_flag(file); if (hist_data->enable_timestamps) tracing_set_time_stamp_abs(file->tr, false); @@ -3065,6 +3907,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct hist_trigger_attrs *attrs; struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + struct synth_event *se; + const char *se_name; bool remove = false; char *trigger; int ret = 0; @@ -3095,10 +3939,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) + if (!trigger_data) { + ret = -ENOMEM; goto out_free; + } trigger_data->count = -1; trigger_data->ops = trigger_ops; @@ -3117,12 +3962,23 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, } if (remove) { + if (!have_hist_trigger_match(trigger_data, file)) + goto out_free; + if (hist_trigger_check_refs(trigger_data, file)) { ret = -EBUSY; goto out_free; } cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + + mutex_lock(&synth_event_mutex); + se_name = 
trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + mutex_unlock(&synth_event_mutex); + ret = 0; goto out_free; } @@ -3158,6 +4014,13 @@ enable: if (ret) goto out_unreg; + mutex_lock(&synth_event_mutex); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref++; + mutex_unlock(&synth_event_mutex); + /* Just return zero, not the number of registered triggers */ ret = 0; out: @@ -3330,3 +4193,31 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } + +static __init int trace_events_hist_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_hist_init); -- cgit v1.2.3-71-gd317 From 02205a6752f223779a1b0e9e8ffacbea6e717851 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:51:59 -0600 Subject: tracing: Add support for 'field variables' Users should be able to directly specify event fields in hist trigger 'actions' rather than being forced to explicitly create a variable for that purpose. Add support allowing fields to be used directly in actions, which essentially does just that - creates 'invisible' variables for each bare field specified in an action. If a bare field refers to a field on another (matching) event, it even creates a special histogram for the purpose (since variables can't be defined on an existing histogram after histogram creation). Here's a simple example that demonstrates both. Basically the onmatch() action creates a list of variables corresponding to the parameters of the synthetic event to be generated, and then uses those values to generate the event. So for the wakeup_latency synthetic event 'call' below the first param, $wakeup_lat, is a variable defined explicitly on sched_switch, where 'next_pid' is just a normal field on sched_switch, and prio is a normal field on sched_waking. Since the mechanism works on variables, those two normal fields just have 'invisible' variables created internally for them. In the case of 'prio', which is on another event, we actually need to create an additional hist trigger and define the invisible variable on that, since once a hist trigger is defined, variables can't be added to it later. 
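To make the mechanism concrete before the example, here is a condensed C sketch of the resolution flow for a bare field named in an action (illustrative only, error handling elided; create_target_field_var(), save_field_var() and create_field_var_hist() are the helpers this patch adds):

	/*
	 * Sketch: resolve a bare field used in an action into an
	 * 'invisible' variable.
	 */
	field_var = create_target_field_var(hist_data, system, event, field);
	if (!IS_ERR_OR_NULL(field_var)) {
		/* Field is on the target event: just record the variable. */
		save_field_var(hist_data, field_var);
		hist_field = field_var->var;
	} else {
		/*
		 * Field is on the matching event: variables can't be added
		 * to an existing histogram, so create a new, compatible
		 * hist trigger on that event and define the variable there.
		 */
		hist_field = create_field_var_hist(hist_data, system,
						   event, field);
	}

The synthetic event and trigger definitions for the example described above are: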
echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> /sys/kernel/debug/tracing/synthetic_events echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_waking/trigger echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger Link: http://lkml.kernel.org/r/8e8dcdac1ea180ed7a3689e1caeeccede9dc42b3.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 531 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 530 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 80d16d33ad5e..ad96fd110707 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -255,6 +255,16 @@ struct hist_trigger_attrs { struct var_defs var_defs; }; +struct field_var { + struct hist_field *var; + struct hist_field *val; +}; + +struct field_var_hist { + struct hist_trigger_data *hist_data; + char *cmd; +}; + struct hist_trigger_data { struct hist_field *fields[HIST_FIELDS_MAX]; unsigned int n_vals; @@ -274,6 +284,12 @@ struct hist_trigger_data { struct action_data *actions[HIST_ACTIONS_MAX]; unsigned int n_actions; + + struct field_var *field_vars[SYNTH_FIELDS_MAX]; + unsigned int n_field_vars; + unsigned int n_field_var_str; + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; + unsigned int n_field_var_hists; }; struct synth_field { @@ -1427,6 +1443,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, struct hist_elt_data { char *comm; u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; }; static u64 hist_field_var_ref(struct hist_field *hist_field, @@ -1731,6 +1748,11 @@ static inline void save_comm(char *comm, struct task_struct *task) static void hist_elt_data_free(struct hist_elt_data *elt_data) { + unsigned int i; + + for (i = 0; i < SYNTH_FIELDS_MAX; i++) + kfree(elt_data->field_var_str[i]); + kfree(elt_data->comm); kfree(elt_data); } @@ -1748,7 +1770,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) unsigned int size = TASK_COMM_LEN; struct hist_elt_data *elt_data; struct hist_field *key_field; - unsigned int i; + unsigned int i, n_str; elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); if (!elt_data) @@ -1767,6 +1789,18 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } + n_str = hist_data->n_field_var_str; + + size = STR_VAR_LEN_MAX; + + for (i = 0; i < n_str; i++) { + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); + if (!elt_data->field_var_str[i]) { + hist_elt_data_free(elt_data); + return -ENOMEM; + } + } + elt->private_data = elt_data; return 0; @@ -2473,6 +2507,470 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } +static char *find_trigger_filter(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (test->private_data == hist_data) + return test->filter_str; + } + } + + return NULL; +} + +static struct event_command trigger_hist_cmd; +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); + +static bool 
compatible_keys(struct hist_trigger_data *target_hist_data, + struct hist_trigger_data *hist_data, + unsigned int n_keys) +{ + struct hist_field *target_hist_field, *hist_field; + unsigned int n, i, j; + + if (hist_data->n_fields - hist_data->n_vals != n_keys) + return false; + + i = hist_data->n_vals; + j = target_hist_data->n_vals; + + for (n = 0; n < n_keys; n++) { + hist_field = hist_data->fields[i + n]; + target_hist_field = target_hist_data->fields[j + n]; + + if (strcmp(hist_field->type, target_hist_field->type) != 0) + return false; + if (hist_field->size != target_hist_field->size) + return false; + if (hist_field->is_signed != target_hist_field->is_signed) + return false; + } + + return true; +} + +static struct hist_trigger_data * +find_compatible_hist(struct hist_trigger_data *target_hist_data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + unsigned int n_keys; + + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + + if (compatible_keys(target_hist_data, hist_data, n_keys)) + return hist_data; + } + } + + return NULL; +} + +static struct trace_event_file *event_file(struct trace_array *tr, + char *system, char *event_name) +{ + struct trace_event_file *file; + + file = find_event_file(tr, system, event_name); + if (!file) + return ERR_PTR(-EINVAL); + + return file; +} + +static struct hist_field * +find_synthetic_field_var(struct hist_trigger_data *target_hist_data, + char *system, char *event_name, char *field_name) +{ + struct hist_field *event_var; + char *synthetic_name; + + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!synthetic_name) + return ERR_PTR(-ENOMEM); + + strcpy(synthetic_name, "synthetic_"); + strcat(synthetic_name, field_name); + + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); + + kfree(synthetic_name); + + return event_var; +} + +/** + * create_field_var_hist - Automatically create a histogram and var for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @field_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + + * If a user specifies a field on an event that isn't the event the + * histogram currently being defined (the target event histogram), the + * only way that can be accomplished is if a new hist trigger is + * created and the field variable defined on that. + * + * This function creates a new histogram compatible with the target + * event (meaning a histogram with the same key as the target + * histogram), and creates a variable for the specified field, but + * with 'synthetic_' prepended to the variable name in order to avoid + * collision with normal field variables. + * + * Return: The variable created for the field. 
+ */ +struct hist_field * +create_field_var_hist(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *field_name) +{ + struct trace_array *tr = target_hist_data->event_file->tr; + struct hist_field *event_var = ERR_PTR(-EINVAL); + struct hist_trigger_data *hist_data; + unsigned int i, n, first = true; + struct field_var_hist *var_hist; + struct trace_event_file *file; + struct hist_field *key_field; + char *saved_filter; + char *cmd; + int ret; + + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) + return ERR_PTR(-EINVAL); + + file = event_file(tr, subsys_name, event_name); + + if (IS_ERR(file)) { + ret = PTR_ERR(file); + return ERR_PTR(ret); + } + + /* + * Look for a histogram compatible with target. We'll use the + * found histogram specification to create a new matching + * histogram with our variable on it. target_hist_data is not + * yet a registered histogram so we can't use that. + */ + hist_data = find_compatible_hist(target_hist_data, file); + if (!hist_data) + return ERR_PTR(-EINVAL); + + /* See if a synthetic field variable has already been created */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (!IS_ERR_OR_NULL(event_var)) + return event_var; + + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); + if (!var_hist) + return ERR_PTR(-ENOMEM); + + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!cmd) { + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Use the same keys as the compatible histogram */ + strcat(cmd, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + if (!first) + strcat(cmd, ","); + strcat(cmd, key_field->field->name); + first = false; + } + + /* Create the synthetic field variable specification */ + strcat(cmd, ":synthetic_"); + strcat(cmd, field_name); + strcat(cmd, "="); + strcat(cmd, field_name); + + /* Use the same filter as the compatible histogram */ + saved_filter = find_trigger_filter(hist_data, file); + if (saved_filter) { + strcat(cmd, " if "); + strcat(cmd, saved_filter); + } + + var_hist->cmd = kstrdup(cmd, GFP_KERNEL); + if (!var_hist->cmd) { + kfree(cmd); + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Save the compatible histogram information */ + var_hist->hist_data = hist_data; + + /* Create the new histogram with our variable */ + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "", "hist", cmd); + if (ret) { + kfree(cmd); + kfree(var_hist->cmd); + kfree(var_hist); + return ERR_PTR(ret); + } + + kfree(cmd); + + /* If we can't find the variable, something went wrong */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (IS_ERR_OR_NULL(event_var)) { + kfree(var_hist->cmd); + kfree(var_hist); + return ERR_PTR(-EINVAL); + } + + n = target_hist_data->n_field_var_hists; + target_hist_data->field_var_hists[n] = var_hist; + target_hist_data->n_field_var_hists++; + + return event_var; +} + +struct hist_field * +find_target_event_var(struct hist_trigger_data *hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *hist_field = NULL; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + hist_field = find_var_field(hist_data, var_name); 
+ + return hist_field; +} + +static inline void __update_field_vars(struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec, + struct field_var **field_vars, + unsigned int n_field_vars, + unsigned int field_var_str_start) +{ + struct hist_elt_data *elt_data = elt->private_data; + unsigned int i, j, var_idx; + u64 var_val; + + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { + struct field_var *field_var = field_vars[i]; + struct hist_field *var = field_var->var; + struct hist_field *val = field_var->val; + + var_val = val->fn(val, elt, rbe, rec); + var_idx = var->var.idx; + + if (val->flags & HIST_FIELD_FL_STRING) { + char *str = elt_data->field_var_str[j++]; + char *val_str = (char *)(uintptr_t)var_val; + + strncpy(str, val_str, STR_VAR_LEN_MAX); + var_val = (u64)(uintptr_t)str; + } + tracing_map_set_var(elt, var_idx, var_val); + } +} + +static void update_field_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->field_vars, + hist_data->n_field_vars, 0); +} + +static struct hist_field *create_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *name, int size, const char *type) +{ + struct hist_field *var; + int idx; + + if (find_var(hist_data, file, name) && !hist_data->remove) { + var = ERR_PTR(-EINVAL); + goto out; + } + + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!var) { + var = ERR_PTR(-ENOMEM); + goto out; + } + + idx = tracing_map_add_var(hist_data->map); + if (idx < 0) { + kfree(var); + var = ERR_PTR(-EINVAL); + goto out; + } + + var->flags = HIST_FIELD_FL_VAR; + var->var.idx = idx; + var->var.hist_data = var->hist_data = hist_data; + var->size = size; + var->var.name = kstrdup(name, GFP_KERNEL); + var->type = kstrdup(type, GFP_KERNEL); + if (!var->var.name || !var->type) { + kfree(var->var.name); + kfree(var->type); + kfree(var); + var = ERR_PTR(-ENOMEM); + } + out: + return var; +} + +static struct field_var *create_field_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *field_name) +{ + struct hist_field *val = NULL, *var = NULL; + unsigned long flags = HIST_FIELD_FL_VAR; + struct field_var *field_var; + int ret = 0; + + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + val = parse_atom(hist_data, file, field_name, &flags, NULL); + if (IS_ERR(val)) { + ret = PTR_ERR(val); + goto err; + } + + var = create_var(hist_data, file, field_name, val->size, val->type); + if (IS_ERR(var)) { + kfree(val); + ret = PTR_ERR(var); + goto err; + } + + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); + if (!field_var) { + kfree(val); + kfree(var); + ret = -ENOMEM; + goto err; + } + + field_var->var = var; + field_var->val = val; + out: + return field_var; + err: + field_var = ERR_PTR(ret); + goto out; +} + +/** + * create_target_field_var - Automatically create a variable for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @var_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. 
+ + * This function creates a field variable with the name var_name on + * the hist trigger currently being defined on the target event. If + * subsys_name and event_name are specified, this function simply + * verifies that they do in fact match the target event subsystem and + * event name. + * + * Return: The variable created for the field. + */ +struct field_var * +create_target_field_var(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = target_hist_data->event_file; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + return create_field_var(target_hist_data, file, var_name); +} + +static void destroy_field_var(struct field_var *field_var) +{ + if (!field_var) + return; + + destroy_hist_field(field_var->var, 0); + destroy_hist_field(field_var->val, 0); + + kfree(field_var); +} + +static void destroy_field_vars(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_vars; i++) + destroy_field_var(hist_data->field_vars[i]); +} + +void save_field_var(struct hist_trigger_data *hist_data, + struct field_var *field_var) +{ + hist_data->field_vars[hist_data->n_field_vars++] = field_var; + + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_field_var_str++; +} + static int create_hitcount_val(struct hist_trigger_data *hist_data) { hist_data->fields[HITCOUNT_IDX] = @@ -2928,6 +3426,16 @@ static int create_actions(struct hist_trigger_data *hist_data, return ret; } +static void destroy_field_var_hists(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + kfree(hist_data->field_var_hists[i]->cmd); + kfree(hist_data->field_var_hists[i]); + } +} + static void destroy_hist_data(struct hist_trigger_data *hist_data) { if (!hist_data) @@ -2938,6 +3446,8 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data) tracing_map_destroy(hist_data->map); destroy_actions(hist_data); + destroy_field_vars(hist_data); + destroy_field_var_hists(hist_data); kfree(hist_data); } @@ -3074,6 +3584,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, tracing_map_set_var(elt, var_idx, hist_val); } } + + update_field_vars(hist_data, elt, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -3518,6 +4030,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops, return 0; } +static void unregister_field_var_hists(struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file; + unsigned int i; + char *cmd; + int ret; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + file = hist_data->field_var_hists[i]->hist_data->event_file; + cmd = hist_data->field_var_hists[i]->cmd; + ret = event_hist_trigger_func(&trigger_hist_cmd, file, + "!hist", "hist", cmd); + } +} + static void event_hist_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data) { @@ -3535,6 +4062,8 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, remove_hist_vars(hist_data); + unregister_field_var_hists(hist_data); + destroy_hist_data(hist_data); } } -- cgit v1.2.3-71-gd317 From c282a386a39771588fe4cfdc01bbb8a255092e38 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:00 -0600 Subject: tracing: Add 
'onmatch' hist trigger action support Add an 'onmatch(matching.event).(param list)' hist trigger action which is invoked with the set of variables or event fields named in the 'param list'. The result is the generation of a synthetic event that consists of the values contained in those variables and/or fields at the time the invoking event was hit. As an example the below defines a simple synthetic event using a variable defined on the sched_wakeup_new event, and shows the event definition with unresolved fields, since the sched_wakeup_new event with the testpid variable hasn't been defined yet: # echo 'wakeup_new_test pid_t pid; int prio' >> \ /sys/kernel/debug/tracing/synthetic_events # cat /sys/kernel/debug/tracing/synthetic_events wakeup_new_test pid_t pid; int prio The following hist trigger both defines a testpid variable and specifies an onmatch() trace action that uses that variable along with a non-variable field to generate a wakeup_new_test synthetic event whenever a sched_wakeup_new event occurs, which because of the 'if comm == "cyclictest"' filter only happens when the executable is cyclictest: # echo 'hist:testpid=pid:keys=$testpid:\ onmatch(sched.sched_wakeup_new).wakeup_new_test($testpid, prio) \ if comm=="cyclictest"' >> \ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger Creating and displaying a histogram based on those events is now just a matter of using the fields and new synthetic event in the tracing/events/synthetic directory, as usual: # echo 'hist:keys=pid,prio:sort=pid,prio' >> \ /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger Link: http://lkml.kernel.org/r/8c2a574bcb7530c876629c901ecd23911b14afe8.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Rajvi Jingar Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 488 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 475 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ad96fd110707..9ac6089b7513 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -285,6 +285,8 @@ struct hist_trigger_data { struct action_data *actions[HIST_ACTIONS_MAX]; unsigned int n_actions; + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; + unsigned int n_synth_var_refs; struct field_var *field_vars[SYNTH_FIELDS_MAX]; unsigned int n_field_vars; unsigned int n_field_var_str; @@ -321,7 +323,18 @@ typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct action_data { action_fn_t fn; - unsigned int var_ref_idx; + unsigned int n_params; + char *params[SYNTH_FIELDS_MAX]; + + union { + struct { + unsigned int var_ref_idx; + char *match_event; + char *match_event_system; + char *synth_event_name; + struct synth_event *synth_event; + } onmatch; + }; }; static LIST_HEAD(synth_event_list); @@ -887,6 +900,21 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields, return event; } +static void action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals) +{ + struct synth_event *event = data->onmatch.synth_event; + + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); +} + +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; +}; + static void add_or_delete_synth_event(struct synth_event *event, int delete) { if (delete) @@ -1124,11 +1152,6 
@@ static u64 hist_field_timestamp(struct hist_field *hist_field, return ts; } -struct hist_var_data { - struct list_head list; - struct hist_trigger_data *hist_data; -}; - static struct hist_field * check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, @@ -1194,6 +1217,14 @@ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, return found; } + for (i = 0; i < hist_data->n_synth_var_refs; i++) { + hist_field = hist_data->synth_var_refs[i]; + found = check_field_for_var_refs(hist_data, hist_field, + var_data, var_idx, 0); + if (found) + return found; + } + return found; } @@ -1422,6 +1453,37 @@ static struct hist_field *find_file_var(struct trace_event_file *file, return NULL; } +static struct hist_field * +find_match_var(struct hist_trigger_data *hist_data, char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field, *found = NULL; + struct trace_event_file *file; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) { + char *system = data->onmatch.match_event_system; + char *event_name = data->onmatch.match_event; + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + continue; + hist_field = find_file_var(file, var_name); + if (hist_field) { + if (found) { + return ERR_PTR(-EINVAL); + } + + found = hist_field; + } + } + } + return found; +} + static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, char *system, char *event_name, @@ -1431,6 +1493,14 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, struct hist_field *hist_field = NULL; struct trace_event_file *file; + if (!system || !event_name) { + hist_field = find_match_var(hist_data, var_name); + if (IS_ERR(hist_field)) + return NULL; + if (hist_field) + return hist_field; + } + file = find_var_file(tr, system, event_name, var_name); if (!file) return NULL; @@ -1622,11 +1692,21 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) static int parse_action(char *str, struct hist_trigger_attrs *attrs) { - int ret = 0; + int ret = -EINVAL; if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0)) { + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); + if (!attrs->action_str[attrs->n_actions]) { + ret = -ENOMEM; + return ret; + } + attrs->n_actions++; + ret = 0; + } + return ret; } @@ -2635,7 +2715,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data, * * Return: The variable created for the field. */ -struct hist_field * +static struct hist_field * create_field_var_hist(struct hist_trigger_data *target_hist_data, char *subsys_name, char *event_name, char *field_name) { @@ -2748,7 +2828,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, return event_var; } -struct hist_field * +static struct hist_field * find_target_event_var(struct hist_trigger_data *hist_data, char *subsys_name, char *event_name, char *var_name) { @@ -2919,7 +2999,7 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, * * Return: The variable created for the field. 
*/ -struct field_var * +static struct field_var * create_target_field_var(struct hist_trigger_data *target_hist_data, char *subsys_name, char *event_name, char *var_name) { @@ -2943,6 +3023,27 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } +static void onmatch_destroy(struct action_data *data) +{ + unsigned int i; + + mutex_lock(&synth_event_mutex); + + kfree(data->onmatch.match_event); + kfree(data->onmatch.match_event_system); + kfree(data->onmatch.synth_event_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + if (data->onmatch.synth_event) + data->onmatch.synth_event->ref--; + + kfree(data); + + mutex_unlock(&synth_event_mutex); +} + static void destroy_field_var(struct field_var *field_var) { if (!field_var) @@ -2962,8 +3063,8 @@ static void destroy_field_vars(struct hist_trigger_data *hist_data) destroy_field_var(hist_data->field_vars[i]); } -void save_field_var(struct hist_trigger_data *hist_data, - struct field_var *field_var) +static void save_field_var(struct hist_trigger_data *hist_data, + struct field_var *field_var) { hist_data->field_vars[hist_data->n_field_vars++] = field_var; @@ -2971,6 +3072,304 @@ void save_field_var(struct hist_trigger_data *hist_data, hist_data->n_field_var_str++; } + +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_synth_var_refs; i++) + destroy_hist_field(hist_data->synth_var_refs[i], 0); +} + +static void save_synth_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_ref) +{ + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; + + hist_data->var_refs[hist_data->n_var_refs] = var_ref; + var_ref->var_ref_idx = hist_data->n_var_refs++; +} + +static int check_synth_field(struct synth_event *event, + struct hist_field *hist_field, + unsigned int field_pos) +{ + struct synth_field *field; + + if (field_pos >= event->n_fields) + return -EINVAL; + + field = event->fields[field_pos]; + + if (strcmp(field->type, hist_field->type) != 0) + return -EINVAL; + + return 0; +} + +static int parse_action_params(char *params, struct action_data *data) +{ + char *param, *saved_param; + int ret = 0; + + while (params) { + if (data->n_params >= SYNTH_FIELDS_MAX) + goto out; + + param = strsep(¶ms, ","); + if (!param) { + ret = -EINVAL; + goto out; + } + + param = strstrip(param); + if (strlen(param) < 2) { + ret = -EINVAL; + goto out; + } + + saved_param = kstrdup(param, GFP_KERNEL); + if (!saved_param) { + ret = -ENOMEM; + goto out; + } + + data->params[data->n_params++] = saved_param; + } + out: + return ret; +} + +static struct hist_field * +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, + char *system, char *event, char *var) +{ + struct hist_field *hist_field; + + var++; /* skip '$' */ + + hist_field = find_target_event_var(hist_data, system, event, var); + if (!hist_field) { + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + hist_field = find_event_var(hist_data, system, event, var); + } + + return hist_field; +} + +static struct hist_field * +onmatch_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) +{ + struct hist_field *hist_field = NULL; + struct field_var *field_var; + + /* + * First try to create a field var on the target event (the + * currently being defined). 
This will create a variable for + * unqualified fields on the target event, or if qualified, + * target fields that have qualified names matching the target. + */ + field_var = create_target_field_var(hist_data, system, event, var); + + if (field_var && !IS_ERR(field_var)) { + save_field_var(hist_data, field_var); + hist_field = field_var->var; + } else { + field_var = NULL; + /* + * If no explicit system.event is specfied, default to + * looking for fields on the onmatch(system.event.xxx) + * event. + */ + if (!system) { + system = data->onmatch.match_event_system; + event = data->onmatch.match_event; + } + + /* + * At this point, we're looking at a field on another + * event. Because we can't modify a hist trigger on + * another event to add a variable for a field, we need + * to create a new trigger on that event and create the + * variable at the same time. + */ + hist_field = create_field_var_hist(hist_data, system, event, var); + if (IS_ERR(hist_field)) + goto free; + } + out: + return hist_field; + free: + destroy_field_var(field_var); + hist_field = NULL; + goto out; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + struct action_data *data) +{ + char *event_name, *param, *system = NULL; + struct hist_field *hist_field, *var_ref; + unsigned int i, var_ref_idx; + unsigned int field_pos = 0; + struct synth_event *event; + int ret = 0; + + mutex_lock(&synth_event_mutex); + event = find_synth_event(data->onmatch.synth_event_name); + if (!event) { + mutex_unlock(&synth_event_mutex); + return -EINVAL; + } + event->ref++; + mutex_unlock(&synth_event_mutex); + + var_ref_idx = hist_data->n_var_refs; + + for (i = 0; i < data->n_params; i++) { + char *p; + + p = param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto err; + } + + system = strsep(¶m, "."); + if (!param) { + param = (char *)system; + system = event_name = NULL; + } else { + event_name = strsep(¶m, "."); + if (!param) { + kfree(p); + ret = -EINVAL; + goto err; + } + } + + if (param[0] == '$') + hist_field = onmatch_find_var(hist_data, data, system, + event_name, param); + else + hist_field = onmatch_create_field_var(hist_data, data, + system, + event_name, + param); + + if (!hist_field) { + kfree(p); + ret = -EINVAL; + goto err; + } + + if (check_synth_field(event, hist_field, field_pos) == 0) { + var_ref = create_var_ref(hist_field, system, event_name); + if (!var_ref) { + kfree(p); + ret = -ENOMEM; + goto err; + } + + save_synth_var_ref(hist_data, var_ref); + field_pos++; + kfree(p); + continue; + } + + kfree(p); + ret = -EINVAL; + goto err; + } + + if (field_pos != event->n_fields) { + ret = -EINVAL; + goto err; + } + + data->fn = action_trace; + data->onmatch.synth_event = event; + data->onmatch.var_ref_idx = var_ref_idx; + out: + return ret; + err: + mutex_lock(&synth_event_mutex); + event->ref--; + mutex_unlock(&synth_event_mutex); + + goto out; +} + +static struct action_data *onmatch_parse(struct trace_array *tr, char *str) +{ + char *match_event, *match_event_system; + char *synth_event_name, *params; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + match_event = strsep(&str, ")"); + if (!match_event || !str) + goto free; + + match_event_system = strsep(&match_event, "."); + if (!match_event) + goto free; + + if (IS_ERR(event_file(tr, match_event_system, match_event))) + goto free; + + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); + 
if (!data->onmatch.match_event) { + ret = -ENOMEM; + goto free; + } + + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->onmatch.match_event_system) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) + goto free; + + synth_event_name = strsep(&str, "("); + if (!synth_event_name || !str) + goto free; + + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); + if (!data->onmatch.synth_event_name) { + ret = -ENOMEM; + goto free; + } + + params = strsep(&str, ")"); + if (!params || !str || (str && strlen(str))) + goto free; + + ret = parse_action_params(params, data); + if (ret) + goto free; + out: + return data; + free: + onmatch_destroy(data); + data = ERR_PTR(ret); + goto out; +} + static int create_hitcount_val(struct hist_trigger_data *hist_data) { hist_data->fields[HITCOUNT_IDX] = @@ -3395,18 +3794,39 @@ static void destroy_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - kfree(data); + if (data->fn == action_trace) + onmatch_destroy(data); + else + kfree(data); } } static int parse_actions(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; + struct action_data *data; unsigned int i; int ret = 0; char *str; for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; + + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { + char *action_str = str + strlen("onmatch("); + + data = onmatch_parse(tr, action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = action_trace; + } else { + ret = -EINVAL; + break; + } + + hist_data->actions[hist_data->n_actions++] = data; } return ret; @@ -3421,11 +3841,50 @@ static int create_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->attrs->n_actions; i++) { data = hist_data->actions[i]; + + if (data->fn == action_trace) { + ret = onmatch_create(hist_data, file, data); + if (ret) + return ret; + } } return ret; } +static void print_onmatch_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, + data->onmatch.match_event); + + seq_printf(m, "%s(", data->onmatch.synth_event->name); + + for (i = 0; i < data->n_params; i++) { + if (i) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } + + seq_puts(m, ")"); +} + +static void print_actions_spec(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == action_trace) + print_onmatch_spec(m, hist_data, data); + } +} + static void destroy_field_var_hists(struct hist_trigger_data *hist_data) { unsigned int i; @@ -3448,6 +3907,7 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data) destroy_actions(hist_data); destroy_field_vars(hist_data); destroy_field_var_hists(hist_data); + destroy_synth_var_refs(hist_data); kfree(hist_data); } @@ -4004,6 +4464,8 @@ static int event_hist_trigger_print(struct seq_file *m, } seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + print_actions_spec(m, hist_data); + if (data->filter_str) seq_printf(m, " if %s", data->filter_str); -- cgit v1.2.3-71-gd317 From 50450603ec9cb808d39b1461fe67a81d82b37129 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:01 -0600 Subject: 
tracing: Add 'onmax' hist trigger action support Add an 'onmax(var).save(field,...)' hist trigger action which is invoked whenever an event exceeds the current maximum. The end result is that the trace event fields or variables specified as the onmax.save() params will be saved if 'var' exceeds the current maximum for that hist trigger entry. This allows context from the event that exhibited the new maximum to be saved for later reference. When the histogram is displayed, additional fields displaying the saved values will be printed. As an example the below defines a couple of hist triggers, one for sched_wakeup and another for sched_switch, keyed on pid. Whenever a sched_wakeup occurs, the timestamp is saved in the entry corresponding to the current pid, and when the scheduler switches back to that pid, the timestamp difference is calculated. If the resulting latency exceeds the current maximum latency, the specified save() values are saved: # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ if comm=="cyclictest"' >> \ /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger # echo 'hist:keys=next_pid:\ wakeup_lat=common_timestamp.usecs-$ts0:\ onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \ if next_comm=="cyclictest"' >> \ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger When the histogram is displayed, the max value and the saved values corresponding to the max are displayed following the rest of the fields: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 3728 } hitcount: 199 \ max: 123 next_comm: cyclictest prev_pid: 0 \ prev_prio: 120 prev_comm: swapper/3 { next_pid: 3730 } hitcount: 1321 \ max: 15 next_comm: cyclictest prev_pid: 0 \ prev_prio: 120 prev_comm: swapper/1 { next_pid: 3729 } hitcount: 1973\ max: 25 next_comm: cyclictest prev_pid: 0 \ prev_prio: 120 prev_comm: swapper/0 Totals: Hits: 3493 Entries: 3 Dropped: 0 Link: http://lkml.kernel.org/r/006907f71b1e839bb059337ec3c496f84fcb71de.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 331 ++++++++++++++++++++++++++++++++++----- 1 file changed, 296 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9ac6089b7513..7bcc32a7e266 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -292,6 +292,10 @@ struct hist_trigger_data { unsigned int n_field_var_str; struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; unsigned int n_field_var_hists; + + struct field_var *max_vars[SYNTH_FIELDS_MAX]; + unsigned int n_max_vars; + unsigned int n_max_var_str; }; struct synth_field { @@ -334,6 +338,14 @@ struct action_data { char *synth_event_name; struct synth_event *synth_event; } onmatch; + + struct { + char *var_str; + char *fn_name; + unsigned int max_var_ref_idx; + struct hist_field *max_var; + struct hist_field *var; + } onmax; }; }; @@ -1697,7 +1709,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; - if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0)) { + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) || + (strncmp(str, "onmax(", strlen("onmax(")) == 0)) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1869,7 +1882,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - 
n_str = hist_data->n_field_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; size = STR_VAR_LEN_MAX; @@ -2894,6 +2907,15 @@ static void update_field_vars(struct hist_trigger_data *hist_data, hist_data->n_field_vars, 0); } +static void update_max_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, rbe, rec, hist_data->max_vars, + hist_data->n_max_vars, hist_data->n_field_var_str); +} + static struct hist_field *create_var(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *name, int size, const char *type) @@ -3023,6 +3045,227 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } +static void onmax_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + + for (i = 0; i < hist_data->n_max_vars; i++) { + struct hist_field *save_val = hist_data->max_vars[i]->val; + struct hist_field *save_var = hist_data->max_vars[i]->var; + u64 val; + + save_var_idx = save_var->var.idx; + + val = tracing_map_read_var(elt, save_var_idx); + + if (save_val->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, " %s: %-32s", save_var->var.name, + (char *)(uintptr_t)(val)); + } else + seq_printf(m, " %s: %10llu", save_var->var.name, val); + } +} + +static void onmax_save(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, + struct action_data *data, u64 *var_ref_vals) +{ + unsigned int max_idx = data->onmax.max_var->var.idx; + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; + + u64 var_val, max_val; + + var_val = var_ref_vals[max_var_ref_idx]; + max_val = tracing_map_read_var(elt, max_idx); + + if (var_val <= max_val) + return; + + tracing_map_set_var(elt, max_idx, var_val); + + update_max_vars(hist_data, elt, rbe, rec); +} + +static void onmax_destroy(struct action_data *data) +{ + unsigned int i; + + destroy_hist_field(data->onmax.max_var, 0); + destroy_hist_field(data->onmax.var, 0); + + kfree(data->onmax.var_str); + kfree(data->onmax.fn_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + kfree(data); +} + +static int onmax_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *var_field, *ref_field, *max_var; + unsigned int var_ref_idx = hist_data->n_var_refs; + struct field_var *field_var; + char *onmax_var_str, *param; + unsigned long flags; + unsigned int i; + int ret = 0; + + onmax_var_str = data->onmax.var_str; + if (onmax_var_str[0] != '$') + return -EINVAL; + onmax_var_str++; + + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + if (!var_field) + return -EINVAL; + + flags = HIST_FIELD_FL_VAR_REF; + ref_field = create_hist_field(hist_data, NULL, flags, NULL); + if (!ref_field) + return -ENOMEM; + + if (init_var_ref(ref_field, var_field, NULL, NULL)) { + destroy_hist_field(ref_field, 0); + ret = -ENOMEM; + goto out; + } + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; + data->onmax.var = ref_field; + + data->fn = onmax_save; + data->onmax.max_var_ref_idx = var_ref_idx; + max_var = create_var(hist_data, 
file, "max", sizeof(u64), "u64"); + if (IS_ERR(max_var)) { + ret = PTR_ERR(max_var); + goto out; + } + data->onmax.max_var = max_var; + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->max_vars[hist_data->n_max_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_max_var_str++; + + kfree(param); + } + out: + return ret; +} + +static int parse_action_params(char *params, struct action_data *data) +{ + char *param, *saved_param; + int ret = 0; + + while (params) { + if (data->n_params >= SYNTH_FIELDS_MAX) + goto out; + + param = strsep(¶ms, ","); + if (!param) { + ret = -EINVAL; + goto out; + } + + param = strstrip(param); + if (strlen(param) < 2) { + ret = -EINVAL; + goto out; + } + + saved_param = kstrdup(param, GFP_KERNEL); + if (!saved_param) { + ret = -ENOMEM; + goto out; + } + + data->params[data->n_params++] = saved_param; + } + out: + return ret; +} + +static struct action_data *onmax_parse(char *str) +{ + char *onmax_fn_name, *onmax_var_str; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + onmax_var_str = strsep(&str, ")"); + if (!onmax_var_str || !str) { + ret = -EINVAL; + goto free; + } + + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); + if (!data->onmax.var_str) { + ret = -ENOMEM; + goto free; + } + + strsep(&str, "."); + if (!str) + goto free; + + onmax_fn_name = strsep(&str, "("); + if (!onmax_fn_name || !str) + goto free; + + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { + char *params = strsep(&str, ")"); + + if (!params) { + ret = -EINVAL; + goto free; + } + + ret = parse_action_params(params, data); + if (ret) + goto free; + } else + goto free; + + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); + if (!data->onmax.fn_name) { + ret = -ENOMEM; + goto free; + } + out: + return data; + free: + onmax_destroy(data); + data = ERR_PTR(ret); + goto out; +} + static void onmatch_destroy(struct action_data *data) { unsigned int i; @@ -3107,39 +3350,6 @@ static int check_synth_field(struct synth_event *event, return 0; } -static int parse_action_params(char *params, struct action_data *data) -{ - char *param, *saved_param; - int ret = 0; - - while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) - goto out; - - param = strsep(¶ms, ","); - if (!param) { - ret = -EINVAL; - goto out; - } - - param = strstrip(param); - if (strlen(param) < 2) { - ret = -EINVAL; - goto out; - } - - saved_param = kstrdup(param, GFP_KERNEL); - if (!saved_param) { - ret = -ENOMEM; - goto out; - } - - data->params[data->n_params++] = saved_param; - } - out: - return ret; -} - static struct hist_field * onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, char *system, char *event, char *var) @@ -3796,6 +4006,8 @@ static void destroy_actions(struct hist_trigger_data *hist_data) if (data->fn == action_trace) onmatch_destroy(data); + else if (data->fn == onmax_save) + onmax_destroy(data); else kfree(data); } @@ -3821,6 +4033,15 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { + char *action_str = str + strlen("onmax("); + + data = onmax_parse(action_str); + if 
(IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + data->fn = onmax_save; } else { ret = -EINVAL; break; @@ -3846,12 +4067,48 @@ static int create_actions(struct hist_trigger_data *hist_data, ret = onmatch_create(hist_data, file, data); if (ret) return ret; + } else if (data->fn == onmax_save) { + ret = onmax_create(hist_data, data); + if (ret) + return ret; } } return ret; } +static void print_actions(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->fn == onmax_save) + onmax_print(m, hist_data, elt, data); + } +} + +static void print_onmax_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + seq_puts(m, ":onmax("); + seq_printf(m, "%s", data->onmax.var_str); + seq_printf(m, ").%s(", data->onmax.fn_name); + + for (i = 0; i < hist_data->n_max_vars; i++) { + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); + if (i < hist_data->n_max_vars - 1) + seq_puts(m, ","); + } + seq_puts(m, ")"); +} + static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) @@ -3882,6 +4139,8 @@ static void print_actions_spec(struct seq_file *m, if (data->fn == action_trace) print_onmatch_spec(m, hist_data, data); + else if (data->fn == onmax_save) + print_onmax_spec(m, hist_data, data); } } @@ -4263,6 +4522,8 @@ hist_trigger_entry_print(struct seq_file *m, } } + print_actions(m, hist_data, elt); + seq_puts(m, "\n"); } -- cgit v1.2.3-71-gd317 From ec5ce0987541087dbea5af346bdb85eb04b0f0a2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:02 -0600 Subject: tracing: Allow whitespace to surround hist trigger filter The existing code only allows for one space before and after the 'if' specifying the filter for a hist trigger. Add code to make that more permissive as far as whitespace goes. Specifically, we want to allow spaces in the trigger itself now that we have additional syntax (onmatch/onmax) where spaces are more natural e.g. spaces after commas in param lists. 
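In essence, the new separator scan looks for the first standalone "if" token rather than splitting at the first whitespace. A simplified standalone sketch of that logic (function name illustrative, not the exact kernel code):

	static char *find_filter_clause(char *param)
	{
		char *p = param;

		while ((p = strstr(p, "if")) != NULL) {
			if (p > param &&
			    (p[-1] == ' ' || p[-1] == '\t') &&
			    (p[2] == ' ' || p[2] == '\t'))
				return p;	/* standalone "if": filter clause */
			p++;			/* "if" inside a word, keep scanning */
		}

		return NULL;			/* trigger has no filter clause */
	}

Everything before the returned pointer is the trigger specification, which can now itself contain spaces; everything after the "if" is handed to the filter parser (surrounding whitespace is stripped from both parts).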
Link: http://lkml.kernel.org/r/1053090c3c308d4f431accdeb59dff4b511d4554.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7bcc32a7e266..7e88daae85b6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5162,7 +5162,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct synth_event *se; const char *se_name; bool remove = false; - char *trigger; + char *trigger, *p; int ret = 0; if (!param) @@ -5171,10 +5171,37 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (glob[0] == '!') remove = true; - /* separate the trigger from the filter (k:v [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) - return -EINVAL; + /* + * separate the trigger from the filter (k:v [if filter]) + * allowing for whitespace in the trigger + */ + p = trigger = param; + do { + p = strstr(p, "if"); + if (!p) + break; + if (p == param) + return -EINVAL; + if (*(p - 1) != ' ' && *(p - 1) != '\t') { + p++; + continue; + } + if (p >= param + strlen(param) - strlen("if") - 1) + return -EINVAL; + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { + p++; + continue; + } + break; + } while (p); + + if (!p) + param = NULL; + else { + *(p - 1) = '\0'; + param = strstrip(p); + trigger = strstrip(trigger); + } attrs = parse_hist_trigger_attrs(trigger); if (IS_ERR(attrs)) -- cgit v1.2.3-71-gd317 From 8b7622bf94a44b3f912e6492bf500e86171300b8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:03 -0600 Subject: tracing: Add cpu field for hist triggers A common key to use in a histogram is the cpuid - add a new cpu 'synthetic' field named 'cpu' for that purpose. Link: http://lkml.kernel.org/r/89537645bfc957e0d76e2cacf5f0ada88691a6cc.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 15 +++++++++++++++ kernel/trace/trace_events_hist.c | 28 +++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index 25c94730d3fe..be612ca79455 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -172,6 +172,21 @@ The examples below provide a more concrete illustration of the concepts and typical usage patterns discussed above. + 'special' event fields + ------------------------ + + There are a number of 'special event fields' available for use as + keys or values in a hist trigger. These look like and behave as if + they were actual event fields, but aren't really part of the event's + field definition or format file. They are however available for any + event, and can be used anywhere an actual event field could be. + They are: + + common_timestamp u64 - timestamp (from ring buffer) associated + with the event, in nanoseconds. May be + modified by .usecs to have timestamps + interpreted as microseconds. + cpu int - the cpu on which the event occurred. 
6.2 'hist' trigger examples --------------------------- diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7e88daae85b6..98be6ad883eb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -227,6 +227,7 @@ enum hist_field_flags { HIST_FIELD_FL_VAR = 1 << 12, HIST_FIELD_FL_EXPR = 1 << 13, HIST_FIELD_FL_VAR_REF = 1 << 14, + HIST_FIELD_FL_CPU = 1 << 15, }; struct var_defs { @@ -1164,6 +1165,16 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, return ts; } +static u64 hist_field_cpu(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct ring_buffer_event *rbe, + void *event) +{ + int cpu = smp_processor_id(); + + return cpu; +} + static struct hist_field * check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, @@ -1602,6 +1613,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_CPU) + field_name = "cpu"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2109,6 +2122,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_CPU) { + hist_field->fn = hist_field_cpu; + hist_field->size = sizeof(int); + hist_field->type = kstrdup("unsigned int", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2345,7 +2367,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else { + } else if (strcmp(field_name, "cpu") == 0) + *flags |= HIST_FIELD_FL_CPU; + else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { field = ERR_PTR(-EINVAL); @@ -4619,6 +4643,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); + else if (hist_field->flags & HIST_FIELD_FL_CPU) + seq_puts(m, "cpu"); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF) seq_putc(m, '$'); -- cgit v1.2.3-71-gd317 From 7e8b88a30b085d4205b6afcc5e577604978b1268 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:04 -0600 Subject: tracing: Add hist trigger support for variable reference aliases Add support for alias=$somevar where alias can be used as onmatch.xxx($alias). Aliases are a way of creating a new name for an existing variable, providing flexibility to make naming clearer in certain cases. 
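Internally, an alias is a thin wrapper around an existing variable reference: a new hist_field that inherits the reference's value function, so reading the alias resolves to the same storage. Roughly (a simplified sketch of the create_alias() helper added below, error handling elided):

	/* Sketch: an alias is a VAR-flagged field wrapping a var ref. */
	alias = create_hist_field(hist_data, NULL,
				  HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR,
				  var_name);
	alias->fn = var_ref->fn;	/* resolves to the referenced value */
	alias->operands[0] = var_ref;	/* remember what is being aliased */
	init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name);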
For example in the below the user perhaps feels that using $new_lat in the synthetic event invocation is opaque or doesn't fit well stylistically with previous triggers, so creates an alias of $new_lat named $latency and uses that in the call instead: # echo 'hist:keys=next_pid:new_lat=common_timestamp.usecs' > /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=pid:latency=$new_lat: onmatch(sched.sched_switch).wake2($latency,pid)' > /sys/kernel/debug/tracing/events/synthetic/wake1/trigger Link: http://lkml.kernel.org/r/ef20a65d921af3a873a6f1e8c71407c926d5586f.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 74 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 98be6ad883eb..32af523501bc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -228,6 +228,7 @@ enum hist_field_flags { HIST_FIELD_FL_EXPR = 1 << 13, HIST_FIELD_FL_VAR_REF = 1 << 14, HIST_FIELD_FL_CPU = 1 << 15, + HIST_FIELD_FL_ALIAS = 1 << 16, }; struct var_defs { @@ -1609,7 +1610,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; - else if (field->flags & HIST_FIELD_FL_LOG2) + else if (field->flags & HIST_FIELD_FL_LOG2 || + field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; @@ -2080,7 +2082,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->hist_data = hist_data; - if (flags & HIST_FIELD_FL_EXPR) + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) goto out; /* caller will populate */ if (flags & HIST_FIELD_FL_VAR_REF) { @@ -2217,10 +2219,18 @@ static int init_var_ref(struct hist_field *ref_field, } } - ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); - if (!ref_field->name) { - err = -ENOMEM; - goto free; + if (var_field->var.name) { + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } else if (var_field->name) { + ref_field->name = kstrdup(var_field->name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } } ref_field->type = kstrdup(var_field->type, GFP_KERNEL); @@ -2382,6 +2392,28 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, return field; } +static struct hist_field *create_alias(struct hist_trigger_data *hist_data, + struct hist_field *var_ref, + char *var_name) +{ + struct hist_field *alias = NULL; + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR; + + alias = create_hist_field(hist_data, NULL, flags, var_name); + if (!alias) + return NULL; + + alias->fn = var_ref->fn; + alias->operands[0] = var_ref; + + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { + destroy_hist_field(alias, 0); + return NULL; + } + + return alias; +} + static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) @@ -2415,6 +2447,13 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, if (hist_field) { hist_data->var_refs[hist_data->n_var_refs] = hist_field; hist_field->var_ref_idx = hist_data->n_var_refs++; + 
if (var_name) { + hist_field = create_alias(hist_data, hist_field, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + } return hist_field; } } else @@ -2515,6 +2554,26 @@ static int check_expr_operands(struct hist_field *operand1, unsigned long operand1_flags = operand1->flags; unsigned long operand2_flags = operand2->flags; + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) || + (operand1_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand1->var.hist_data, operand1->name); + if (!var) + return -EINVAL; + operand1_flags = var->flags; + } + + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || + (operand2_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand2->var.hist_data, operand2->name); + if (!var) + return -EINVAL; + operand2_flags = var->flags; + } + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) return -EINVAL; @@ -4646,7 +4705,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) else if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "cpu"); else if (field_name) { - if (hist_field->flags & HIST_FIELD_FL_VAR_REF) + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || + hist_field->flags & HIST_FIELD_FL_ALIAS) seq_putc(m, '$'); seq_printf(m, "%s", field_name); } -- cgit v1.2.3-71-gd317 From f404da6e1d46ced7d3b53377f1e140c486ea7350 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:05 -0600 Subject: tracing: Add 'last error' error facility for hist triggers With the addition of variables and actions, it's become necessary to provide more detailed error information to users about syntax errors. Add a 'last error' facility accessible via the erroring event's 'hist' file. Reading the hist file after an error will display more detailed information about what went wrong, if information is available. This extended error information will be available until the next hist trigger command for that event. # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger echo: write error: Invalid argument # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist ERROR: Couldn't yyy: zzz Last command: xxx Also add specific error messages for variable and action errors. Link: http://lkml.kernel.org/r/64e9c422fc8aeafcc2f7a3b4328c0cffe7969129.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 20 +++++ kernel/trace/trace_events_hist.c | 164 ++++++++++++++++++++++++++++++++++---- 2 files changed, 170 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index be612ca79455..0aec2d8e166b 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -188,6 +188,26 @@ interpreted as microseconds. cpu int - the cpu on which the event occurred. + Extended error information + -------------------------- + + For some error conditions encountered when invoking a hist trigger + command, extended error information is available via the + corresponding event's 'hist' file. Reading the hist file after an + error will display more detailed information about what went wrong, + if information is available. This extended error information will + be available until the next hist trigger command for that event. 
+ + If available for a given error condition, the extended error + information and usage takes the following form: + + # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger + echo: write error: Invalid argument + + # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist + ERROR: Couldn't yyy: zzz + Last command: xxx + 6.2 'hist' trigger examples --------------------------- diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 32af523501bc..8719b0ea672f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -351,6 +351,65 @@ struct action_data { }; }; + +static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char hist_err_str[MAX_FILTER_STR_VAL]; + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void hist_err(char *str, char *var) +{ + int maxlen = MAX_FILTER_STR_VAL - 1; + + if (!str) + return; + + if (strlen(hist_err_str)) + return; + + if (!var) + var = ""; + + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) + return; + + strcat(hist_err_str, str); + strcat(hist_err_str, var); +} + +static void hist_err_event(char *str, char *system, char *event, char *var) +{ + char err[MAX_FILTER_STR_VAL]; + + if (system && var) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); + else if (system) + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); + else + strncpy(err, var, MAX_FILTER_STR_VAL); + + hist_err(str, err); +} + +static void hist_err_clear(void) +{ + hist_err_str[0] = '\0'; +} + +static bool have_hist_err(void) +{ + if (strlen(hist_err_str)) + return true; + + return false; +} + static LIST_HEAD(synth_event_list); static DEFINE_MUTEX(synth_event_mutex); @@ -1448,8 +1507,10 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, continue; if (find_var_field(var_hist_data, var_name)) { - if (found) + if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); return NULL; + } found = file; } @@ -1498,6 +1559,7 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); return ERR_PTR(-EINVAL); } @@ -1781,6 +1843,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", str); ret = -EINVAL; goto out; } @@ -2335,6 +2398,10 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, if (var_field) ref_field = create_var_ref(var_field, system, event_name); + if (!ref_field) + hist_err_event("Couldn't find variable: $", + system, event_name, var_name); + return ref_field; } @@ -2494,6 +2561,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. 
explicit parens required */ if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); ret = -EINVAL; goto free; } @@ -2575,8 +2643,10 @@ static int check_expr_operands(struct hist_field *operand1, } if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != - (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { + hist_err("Timestamp units in expression don't match", NULL); return -EINVAL; + } return 0; } @@ -2591,8 +2661,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, int field_op, ret = -EINVAL; char *sep, *operand1_str; - if (level > 3) + if (level > 3) { + hist_err("Too many subexpressions (3 max): ", str); return ERR_PTR(-EINVAL); + } field_op = contains_operator(str); @@ -2826,12 +2898,17 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, char *cmd; int ret; - if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { + hist_err_event("onmatch: Too many field variables defined: ", + subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); + } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { + hist_err_event("onmatch: Event file not found: ", + subsys_name, event_name, field_name); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -2843,8 +2920,11 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, * yet a registered histogram so we can't use that. */ hist_data = find_compatible_hist(target_hist_data, file); - if (!hist_data) + if (!hist_data) { + hist_err_event("onmatch: Matching event histogram not found: ", + subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); + } /* See if a synthetic field variable has already been created */ event_var = find_synthetic_field_var(target_hist_data, subsys_name, @@ -2903,6 +2983,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); + hist_err_event("onmatch: Couldn't create histogram for field: ", + subsys_name, event_name, field_name); return ERR_PTR(ret); } @@ -2914,6 +2996,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); + hist_err_event("onmatch: Couldn't find synthetic variable: ", + subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3050,18 +3134,21 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { + hist_err("Too many field variables defined: ", field_name); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { + hist_err("Couldn't parse field variable: ", field_name); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { + hist_err("Couldn't create or find variable: ", field_name); kfree(val); ret = PTR_ERR(var); goto err; @@ -3204,13 +3291,17 @@ static int onmax_create(struct hist_trigger_data *hist_data, int ret = 0; onmax_var_str = data->onmax.var_str; - if (onmax_var_str[0] != '$') + if (onmax_var_str[0] != '$') { + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); return -EINVAL; + } onmax_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); - if (!var_field) + if (!var_field) { + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); return -EINVAL; + } flags = 
HIST_FIELD_FL_VAR_REF; ref_field = create_hist_field(hist_data, NULL, flags, NULL); @@ -3230,6 +3321,7 @@ static int onmax_create(struct hist_trigger_data *hist_data, data->onmax.max_var_ref_idx = var_ref_idx; max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); if (IS_ERR(max_var)) { + hist_err("onmax: Couldn't create onmax variable: ", "max"); ret = PTR_ERR(max_var); goto out; } @@ -3244,6 +3336,7 @@ static int onmax_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { + hist_err("onmax: Couldn't create field variable: ", param); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -3276,6 +3369,7 @@ static int parse_action_params(char *params, struct action_data *data) param = strstrip(param); if (strlen(param) < 2) { + hist_err("Invalid action param: ", param); ret = -EINVAL; goto out; } @@ -3451,6 +3545,9 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, hist_field = find_event_var(hist_data, system, event, var); } + if (!hist_field) + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + return hist_field; } @@ -3518,6 +3615,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, mutex_lock(&synth_event_mutex); event = find_synth_event(data->onmatch.synth_event_name); if (!event) { + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); mutex_unlock(&synth_event_mutex); return -EINVAL; } @@ -3577,12 +3675,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data, continue; } + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + system, event_name, param); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); ret = -EINVAL; goto err; } @@ -3612,15 +3713,22 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) return ERR_PTR(-ENOMEM); match_event = strsep(&str, ")"); - if (!match_event || !str) + if (!match_event || !str) { + hist_err("onmatch: Missing closing paren: ", match_event); goto free; + } match_event_system = strsep(&match_event, "."); - if (!match_event) + if (!match_event) { + hist_err("onmatch: Missing subsystem for match event: ", match_event_system); goto free; + } - if (IS_ERR(event_file(tr, match_event_system, match_event))) + if (IS_ERR(event_file(tr, match_event_system, match_event))) { + hist_err_event("onmatch: Invalid subsystem or event name: ", + match_event_system, match_event, NULL); goto free; + } data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); if (!data->onmatch.match_event) { @@ -3635,12 +3743,16 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) } strsep(&str, "."); - if (!str) + if (!str) { + hist_err("onmatch: Missing . 
after onmatch(): ", str); goto free; + } synth_event_name = strsep(&str, "("); - if (!synth_event_name || !str) + if (!synth_event_name || !str) { + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); goto free; + } data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); if (!data->onmatch.synth_event_name) { @@ -3649,8 +3761,10 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) } params = strsep(&str, ")"); - if (!params || !str || (str && strlen(str))) + if (!params || !str || (str && strlen(str))) { + hist_err("onmatch: Missing closing paramlist paren: ", params); goto free; + } ret = parse_action_params(params, data); if (ret) @@ -3725,7 +3839,9 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; + if (find_var(hist_data, file, var_name) && !hist_data->remove) { + hist_err("Variable already defined: ", var_name); return -EINVAL; } @@ -3806,6 +3922,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + hist_err("Using variable references as keys not supported: ", field_str); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -3919,11 +4036,13 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { + hist_err("Malformed assignment: ", var_name); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { + hist_err("Too many variables defined: ", var_name); ret = -EINVAL; goto free; } @@ -4675,6 +4794,11 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } + if (have_hist_err()) { + seq_printf(m, "\nERROR: %s\n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_hist_cmd); + } + out_unlock: mutex_unlock(&event_mutex); @@ -5039,6 +5163,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); ret = -EINVAL; goto out; } @@ -5058,13 +5183,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, test->paused = false; else if (hist_data->attrs->clear) hist_clear(test); - else + else { + hist_err("Hist trigger already exists", NULL); ret = -EEXIST; + } goto out; } } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { + hist_err("Can't clear or continue a nonexistent hist trigger", NULL); ret = -ENOENT; goto out; } @@ -5251,6 +5379,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, char *trigger, *p; int ret = 0; + if (glob && strlen(glob)) { + last_cmd_set(param); + hist_err_clear(); + } + if (!param) return -EINVAL; @@ -5389,6 +5522,9 @@ enable: /* Just return zero, not the number of registered triggers */ ret = 0; out: + if (ret == 0) + hist_err_clear(); + return ret; out_unreg: cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); -- cgit v1.2.3-71-gd317 From d71bd34d78bb78b9e6f8a0be3952d5fa470a260a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:07 -0600 Subject: tracing: Make tracing_set_clock() non-static Allow tracing code outside of trace.c to access tracing_set_clock(). Some applications may require a particular clock in order to function properly, such as latency calculations. 
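For illustration, a caller outside trace.c can now do something like the following (a minimal sketch only, assuming a struct trace_array *tr is in scope; the hist trigger code added later in this series is the real user):

    /* sketch: request the "global" clock for coherent cross-CPU deltas */
    if (tracing_set_clock(tr, "global"))
        pr_warn("failed to set trace clock\n");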
Also, add an accessor returning the current clock string. Link: http://lkml.kernel.org/r/6d1c53e9ee2163f54e1849f5376573f54f0e6009.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68f8702af9fb..551a7cd0d705 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6214,7 +6214,7 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +int tracing_set_clock(struct trace_array *tr, const char *clockstr) { int i; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 99b7ee7ed127..9de3e2a2f042 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -289,6 +289,7 @@ extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); -- cgit v1.2.3-71-gd317 From a4072fe85ba3671720cab0788291af953db27318 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:08 -0600 Subject: tracing: Add a clock attribute for hist triggers The default clock if timestamps are used in a histogram is "global". If timestamps aren't used, the clock is irrelevant. Use the "clock=" param only if you want to override the default "global" clock for a histogram with timestamps. Link: http://lkml.kernel.org/r/427bed1389c5d22aa40c3e0683e30cc3d151e260.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Rajvi Jingar Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 11 +++++++++- kernel/trace/trace_events_hist.c | 42 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index df08882d091c..6e05510afc28 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -1671,7 +1671,16 @@ features have been added to the hist trigger support: it is in units of nanoseconds; appending '.usecs' to a common_timestamp field changes the units to microseconds. -These features are decribed in more detail in the following sections. +A note on inter-event timestamps: If common_timestamp is used in a +histogram, the trace buffer is automatically switched over to using +absolute timestamps and the "global" trace clock, in order to avoid +bogus timestamp differences with other clocks that aren't coherent +across CPUs. This can be overridden by specifying one of the other +trace clocks instead, using the "clock=XXX" hist trigger attribute, +where XXX is any of the clocks listed in the tracing/trace_clock +pseudo-file. + +These features are described in more detail in the following sections. 
2.2.1 Histogram Variables ------------------------- diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8719b0ea672f..f7d0da20c5c8 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -242,6 +242,7 @@ struct hist_trigger_attrs { char *vals_str; char *sort_key_str; char *name; + char *clock; bool pause; bool cont; bool clear; @@ -1776,6 +1777,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); + kfree(attrs->clock); kfree(attrs); } @@ -1831,6 +1833,19 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) ret = -ENOMEM; goto out; } + } else if (strncmp(str, "clock=", strlen("clock=")) == 0) { + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + str = strstrip(str); + attrs->clock = kstrdup(str, GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto out; + } } else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -1895,6 +1910,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) goto free; } + if (!attrs->clock) { + attrs->clock = kstrdup("global", GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto free; + } + } + return attrs; free: destroy_hist_trigger_attrs(attrs); @@ -4934,6 +4957,8 @@ static int event_hist_trigger_print(struct seq_file *m, seq_puts(m, ".descending"); } seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + if (hist_data->enable_timestamps) + seq_printf(m, ":clock=%s", hist_data->attrs->clock); print_actions_spec(m, hist_data); @@ -5201,7 +5226,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, data->paused = true; if (named_data) { - destroy_hist_data(data->private_data); data->private_data = named_data->private_data; set_named_trigger_data(data, named_data); data->ops = &event_hist_trigger_named_ops; @@ -5213,10 +5237,22 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - ret++; + if (hist_data->enable_timestamps) { + char *clock = hist_data->attrs->clock; + + ret = tracing_set_clock(file->tr, hist_data->attrs->clock); + if (ret) { + hist_err("Couldn't set trace_clock: ", clock); + goto out; + } - if (hist_data->enable_timestamps) tracing_set_time_stamp_abs(file->tr, true); + } + + if (named_data) + destroy_hist_data(hist_data); + + ret++; out: return ret; } -- cgit v1.2.3-71-gd317 From 8e012066fe0de5ff5be606836f9075511bce5604 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Feb 2018 17:26:32 -0500 Subject: ring-buffer: Add nesting for adding events within events The ring-buffer code has recursion protection in case tracing ends up tracing itself: the ring buffer will detect that it was called at the same context (normal, softirq, interrupt or NMI) and not continue to record the event. With the histogram synthetic events, however, an event is recorded while another event at the same context is being traced, so the recursion protection triggers and blocks it. Add ring_buffer_nest_start() and ring_buffer_nest_end() to notify the ring buffer that a trace is about to happen within another trace, that it is intended, and that it should not trigger the recursion blocking.
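The intended calling pattern is roughly the following (a minimal sketch, assuming a struct ring_buffer *buffer and a reserved length len; the synthetic event user is added in the next patch):

    ring_buffer_nest_start(buffer);
    event = ring_buffer_lock_reserve(buffer, len);
    if (event) {
        /* fill in the nested event's payload here */
        ring_buffer_unlock_commit(buffer, event);
    }
    ring_buffer_nest_end(buffer);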
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 3 +++ kernel/trace/ring_buffer.c | 57 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 7cb84774c20d..a0233edc0718 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -117,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); +void ring_buffer_nest_start(struct ring_buffer *buffer); +void ring_buffer_nest_end(struct ring_buffer *buffer); + struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 33073cdebb26..a2fd3893cc02 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -477,6 +477,7 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; @@ -2624,10 +2625,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << bit))) + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) return 1; - val |= (1 << bit); + val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return 0; @@ -2636,7 +2637,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + cpu_buffer->current_context &= + cpu_buffer->current_context - (1 << cpu_buffer->nest); +} + +/* The recursive locking above uses 4 bits */ +#define NESTED_BITS 4 + +/** + * ring_buffer_nest_start - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * The ring buffer has a safty mechanism to prevent recursion. + * But there may be a case where a trace needs to be done while + * tracing something else. In this case, calling this function + * will allow this function to nest within a currently active + * ring_buffer_lock_reserve(). + * + * Call this function before calling another ring_buffer_lock_reserve() and + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_start(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* Enabled by ring_buffer_nest_end() */ + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest += NESTED_BITS; +} + +/** + * ring_buffer_nest_end - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * Must be called after ring_buffer_nest_start() and after the + * ring_buffer_unlock_commit(). 
+ */ +void ring_buffer_nest_end(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* disabled by ring_buffer_nest_start() */ + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recusive locking */ + cpu_buffer->nest -= NESTED_BITS; + preempt_enable_notrace(); } /** -- cgit v1.2.3-71-gd317 From 4708abc6c68b41a656afb431818d5c57d7fdfd24 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Feb 2018 17:29:46 -0500 Subject: tracing: Use the ring-buffer nesting to allow synthetic events to be traced Synthetic events can be done within the recording of other events. Notify the ring buffer via ring_buffer_nest_start() and ring_buffer_nest_end() that this is intended and not to block it due to its recursion protection. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f7d0da20c5c8..4f027642ceef 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -640,6 +640,7 @@ static notrace void trace_event_raw_event_synth(void *__data, struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; struct synth_event *event; unsigned int i, n_u64; int fields_size = 0; @@ -651,10 +652,17 @@ static notrace void trace_event_raw_event_synth(void *__data, fields_size = event->n_u64 * sizeof(u64); + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->trace_buffer.buffer; + ring_buffer_nest_start(buffer); + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - return; + goto out; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { if (event->fields[i]->is_string) { @@ -670,6 +678,8 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) -- cgit v1.2.3-71-gd317 From 89e270c1df0c56d6ce3c2d9ed3347b527c684b16 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 15 Jan 2018 20:52:10 -0600 Subject: tracing: Add inter-event blurb to HIST_TRIGGERS config option So that users know that inter-event tracing is supported as part of the HIST_TRIGGERS option, include text to that effect in the help text. Link: http://lkml.kernel.org/r/a38e24231d8d980be636b56d35814570acfd167a.1516069914.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 0b249e2f0c3c..c4f0f2e4126e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -606,7 +606,10 @@ config HIST_TRIGGERS event activity as an initial guide for further investigation using more advanced tools. - See Documentation/trace/events.txt. + Inter-event tracing of quantities such as latencies is also + supported using hist triggers under this option. + + See Documentation/trace/histogram.txt. If in doubt, say N. 
config MMIOTRACE_TEST -- cgit v1.2.3-71-gd317 From a0ff08fd4e3f8b1cbc18950a8bf1f9067f7e700a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Feb 2018 22:32:51 -0500 Subject: tracing: Remove BUG_ON() from append_filter_string() There's no reason to BUG if there's a bug in the filtering code. Simply do a warning and return. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a764aec3c9a1..819a13c3e13c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -681,7 +681,8 @@ static int append_filter_string(struct event_filter *filter, int newlen; char *new_filter_string; - BUG_ON(!filter->filter_string); + if (WARN_ON(!filter->filter_string)) + return -EINVAL; newlen = strlen(filter->filter_string) + strlen(string) + 1; new_filter_string = kmalloc(newlen, GFP_KERNEL); if (!new_filter_string) -- cgit v1.2.3-71-gd317 From 559d421267d1594c541143489d9ee9a869dc6093 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Feb 2018 23:14:04 -0500 Subject: tracing: Use trace_seq instead of open code string appending The filter code does open code string appending to produce an error message. Instead it can be simplified by using trace_seq function helpers. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 57 +++++++++++++++----------------------- 1 file changed, 23 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 819a13c3e13c..f42442cd423a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -675,47 +675,36 @@ static int replace_filter_string(struct event_filter *filter, return 0; } -static int append_filter_string(struct event_filter *filter, - char *string) -{ - int newlen; - char *new_filter_string; - - if (WARN_ON(!filter->filter_string)) - return -EINVAL; - newlen = strlen(filter->filter_string) + strlen(string) + 1; - new_filter_string = kmalloc(newlen, GFP_KERNEL); - if (!new_filter_string) - return -ENOMEM; - - strcpy(new_filter_string, filter->filter_string); - strcat(new_filter_string, string); - kfree(filter->filter_string); - filter->filter_string = new_filter_string; - - return 0; -} - static void append_filter_err(struct filter_parse_state *ps, struct event_filter *filter) { + struct trace_seq *s; int pos = ps->lasterr_pos; - char *buf, *pbuf; + char *buf; + int len; - buf = (char *)__get_free_page(GFP_KERNEL); - if (!buf) + if (WARN_ON(!filter->filter_string)) return; - append_filter_string(filter, "\n"); - memset(buf, ' ', PAGE_SIZE); - if (pos > PAGE_SIZE - 128) - pos = 0; - buf[pos] = '^'; - pbuf = &buf[pos] + 1; - - sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); - append_filter_string(filter, buf); - free_page((unsigned long) buf); + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return; + trace_seq_init(s); + + len = strlen(filter->filter_string); + if (pos > len) + len = pos; + + trace_seq_puts(s, filter->filter_string); + trace_seq_printf(s, "\n%*s", pos, "^"); + trace_seq_printf(s, "\nparse_error: %s\n", err_text[ps->lasterr]); + trace_seq_putc(s, 0); + buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; + } + kfree(s); } static inline struct event_filter 
*event_filter(struct trace_event_file *file) -- cgit v1.2.3-71-gd317 From c7399708b3cd9004205923c3d139dcc7d067a8a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Feb 2018 23:17:54 -0500 Subject: tracing: Remove filter allocator helper The __alloc_filter() function does nothing more than allocate the filter. There's no reason to have it as a helper function. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f42442cd423a..3d60bbeb2ef1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -866,14 +866,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -static struct event_filter *__alloc_filter(void) -{ - struct event_filter *filter; - - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - return filter; -} - static int __alloc_preds(struct event_filter *filter, int n_preds) { struct filter_pred *pred; @@ -1812,7 +1804,7 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, list_add_tail(&filter_item->list, &filter_list); - filter_item->filter = __alloc_filter(); + filter_item->filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter_item->filter) goto fail_mem; filter = filter_item->filter; @@ -1886,7 +1878,7 @@ static int create_filter_start(char *filter_str, bool set_str, WARN_ON_ONCE(*psp || *filterp); /* allocate everything, and if any fails, free all and fail */ - filter = __alloc_filter(); + filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter && set_str) err = replace_filter_string(filter, filter_str); -- cgit v1.2.3-71-gd317 From 404a3add43c9c42fe48b61341badfcb9cca165cc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Feb 2018 23:26:11 -0500 Subject: tracing: Only add filter list when needed replace_system_preds() creates a list of filters to free even when it doesn't really need one. Only save filters that require synchronize_sched() in the filter list to free. This will make the code a bit easier to update in the future.
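The cleanup path then only has to walk filters that actually need the grace period, roughly as below (a sketch of the pattern based on the surrounding replace_system_preds() code, not a verbatim hunk):

    /* sketch: the swapped-out filters may still have readers */
    if (!fail)
        synchronize_sched();
    list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
        __free_filter(filter_item->filter);
        list_del(&filter_item->list);
        kfree(filter_item);
    }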
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3d60bbeb2ef1..2401b7c727a3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1769,6 +1769,7 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, { struct trace_event_file *file; struct filter_list *filter_item; + struct event_filter *filter = NULL; struct filter_list *tmp; LIST_HEAD(filter_list); bool fail = true; @@ -1790,7 +1791,6 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, } list_for_each_entry(file, &tr->events, list) { - struct event_filter *filter; if (file->system != dir) continue; @@ -1798,16 +1798,9 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, if (event_no_set_filter_flag(file)) continue; - filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); - if (!filter_item) - goto fail_mem; - - list_add_tail(&filter_item->list, &filter_list); - - filter_item->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter_item->filter) + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) goto fail_mem; - filter = filter_item->filter; /* Can only fail on no memory */ err = replace_filter_string(filter, filter_string); @@ -1821,13 +1814,20 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, append_filter_err(ps, filter); } else event_set_filtered_flag(file); + + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = event_filter(file); - event_set_filter(file, filter_item->filter); - filter_item->filter = filter; + filter_item->filter = event_filter(file); + event_set_filter(file, filter); + filter = NULL; fail = false; } @@ -1856,6 +1856,7 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: + kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) synchronize_sched(); -- cgit v1.2.3-71-gd317 From 567f6989fd2ac1078d6908fe1bb45932bbeb1b00 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Feb 2018 09:45:27 -0500 Subject: tracing: Embed replace_filter_string() helper function The replace_filter_string() frees the current string and then copies a given string. But in the two locations that it was used, the allocation happened right after the filter was allocated (nothing to replace). There's no need for this to be a helper function. Embedding the allocation in the two places where it was called will make changing the code in the future easier. 
Also make the variable consistent (always use "filter_string" as the name, as it was used in one instance as "filter_str") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2401b7c727a3..c3c6eee1e4df 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -664,17 +664,6 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static int replace_filter_string(struct event_filter *filter, - char *filter_string) -{ - kfree(filter->filter_string); - filter->filter_string = kstrdup(filter_string, GFP_KERNEL); - if (!filter->filter_string) - return -ENOMEM; - - return 0; -} - static void append_filter_err(struct filter_parse_state *ps, struct event_filter *filter) { @@ -1802,9 +1791,8 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, if (!filter) goto fail_mem; - /* Can only fail on no memory */ - err = replace_filter_string(filter, filter_string); - if (err) + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) goto fail_mem; err = replace_preds(file->event_call, filter, ps, false); @@ -1868,7 +1856,7 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, return -ENOMEM; } -static int create_filter_start(char *filter_str, bool set_str, +static int create_filter_start(char *filter_string, bool set_str, struct filter_parse_state **psp, struct event_filter **filterp) { @@ -1880,8 +1868,11 @@ static int create_filter_start(char *filter_str, bool set_str, /* allocate everything, and if any fails, free all and fail */ filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (filter && set_str) - err = replace_filter_string(filter, filter_str); + if (filter && set_str) { + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + err = -ENOMEM; + } ps = kzalloc(sizeof(*ps), GFP_KERNEL); @@ -1895,7 +1886,7 @@ static int create_filter_start(char *filter_str, bool set_str, *filterp = filter; *psp = ps; - parse_init(ps, filter_ops, filter_str); + parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err && set_str) append_filter_err(ps, filter); -- cgit v1.2.3-71-gd317 From 9e5b127d6f33468143d90c8a45ca12410e4c3fa7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Mar 2018 12:52:04 +0100 Subject: perf/core: Fix perf_output_read_group() Mark reported his arm64 perf fuzzer runs sometimes splat like: armv8pmu_read_counter+0x1e8/0x2d8 armpmu_event_update+0x8c/0x188 armpmu_read+0xc/0x18 perf_output_read+0x550/0x11e8 perf_event_read_event+0x1d0/0x248 perf_event_exit_task+0x468/0xbb8 do_exit+0x690/0x1310 do_group_exit+0xd0/0x2b0 get_signal+0x2e8/0x17a8 do_signal+0x144/0x4f8 do_notify_resume+0x148/0x1e8 work_pending+0x8/0x14 which asserts that we only call pmu::read() on ACTIVE events. The above callchain does: perf_event_exit_task() perf_event_exit_task_context() task_ctx_sched_out() // INACTIVE perf_event_exit_event() perf_event_set_state(EXIT) // EXIT sync_child_event() perf_event_read_event() perf_output_read() perf_output_read_group() leader->pmu->read() Which results in doing a pmu::read() on an !ACTIVE event. 
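For comparison, the sibling loop in perf_output_read_group() already guards its read this way (a paraphrased sketch of the existing @sub case):

    if ((sub != event) &&
        (sub->state == PERF_EVENT_STATE_ACTIVE))
        sub->pmu->read(sub);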
I _think_ this is 'new' since we added attr.inherit_stat, which added the perf_event_read_event() to the exit path; without that, perf_event_read_output() would only trigger from samples, and for @event to trigger a sample, its leader _must_ be ACTIVE too. Still, adding this check makes it consistent with the @sub case for the siblings. Reported-and-Tested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 57898102847f..8b6a2774e084 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5730,7 +5730,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); -- cgit v1.2.3-71-gd317 From 8e1a2031e4b556b01ca53cd1fb2d83d811a6605b Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Fri, 8 Sep 2017 11:47:03 +0300 Subject: perf/core: Use RB trees for pinned/flexible groups Change event groups into RB trees sorted by CPU and then by a 64bit index, so that the multiplexing hrtimer interrupt handler can skip to the current CPU's subtree and ignore groups allocated for the other CPUs. A new API for manipulating event groups in the trees is implemented, and the current implementation is adapted to use it. The pinned_group_sched_in() and flexible_group_sched_in() APIs are introduced to consolidate the code that schedules in a whole group from the pinned and flexible trees appropriately.
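The payoff is that scheduling in/out can visit just the software (cpu == -1) subtree plus the current CPU's subtree, roughly as follows (a sketch; the iterators and their real uses are in the diff below):

    /* sketch: software events first, then this CPU's events */
    perf_event_groups_for_each_cpu(event, -1,
                                   &ctx->pinned_groups, group_node)
        group_sched_out(event, cpuctx, ctx);
    perf_event_groups_for_each_cpu(event, smp_processor_id(),
                                   &ctx->pinned_groups, group_node)
        group_sched_out(event, cpuctx, ctx);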
Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/372f9c8b-0cfe-4240-e44d-83d863d40813@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 16 ++- kernel/events/core.c | 307 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 267 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7546822a1d74..6e3f854a34d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -558,7 +558,11 @@ struct perf_event { */ struct list_head group_entry; struct list_head sibling_list; - + /* + * Node on the pinned or flexible tree located at the event context; + */ + struct rb_node group_node; + u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the @@ -690,6 +694,12 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; + +struct perf_event_groups { + struct rb_root tree; + u64 index; +}; + /** * struct perf_event_context - event context structure * @@ -710,8 +720,8 @@ struct perf_event_context { struct mutex mutex; struct list_head active_ctx_list; - struct list_head pinned_groups; - struct list_head flexible_groups; + struct perf_event_groups pinned_groups; + struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_active; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8b6a2774e084..c9fee3640f40 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1460,8 +1460,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize group leader event; + */ +void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits; + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1469,6 +1482,169 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) return &ctx->flexible_groups; } +/* + * Helper function to initializes perf event groups object; + */ +void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +/* + * Compare function for event groups; + * Implements complex key that first sorts by CPU and then by + * virtual index which provides ordering when rotating + * groups for the same CPU; + */ +int perf_event_groups_less(struct perf_event *left, struct perf_event *right) +{ + if (left->cpu < right->cpu) { + return 1; + } else if (left->cpu > right->cpu) { + return 0; + } else { + if (left->group_index < right->group_index) { + return 1; + } else if(left->group_index > right->group_index) { + return 0; + } else { + return 0; + } + } +} + +/* + * Insert a group into a tree using event->cpu as a key. 
If event->cpu node + * is already attached to the tree then the event is added to the attached + * group's group_list list. + */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + struct perf_event *node_event; + struct rb_node *parent; + struct rb_node **node; + + event->group_index = ++groups->index; + + node = &groups->tree.rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, + struct perf_event, group_node); + + if (perf_event_groups_less(event, node_event)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, &groups->tree); +} + +/* + * Helper function to insert event into the pinned or + * flexible groups; + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. If the group is directly attached to the tree + * it also detaches all groups on the group's group_list list. + */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + if (!RB_EMPTY_NODE(&event->group_node) && + !RB_EMPTY_ROOT(&groups->tree)) + rb_erase(&event->group_node, &groups->tree); + + init_event_group(event); +} + +/* + * Helper function to delete event from its groups; + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get a group by a cpu key from groups tree with the least group_index; + */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *node_event = NULL, *match = NULL; + struct rb_node *node = groups->tree.rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + match = node_event; + node = node->rb_left; + } + } + + return match; +} + +/* + * Find group list by a cpu key and rotate it. + */ +static void +perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *event = + perf_event_groups_first(groups, cpu); + + if (event) { + perf_event_groups_delete(groups, event); + perf_event_groups_insert(groups, event); + } +} + +/* + * Iterate event groups thru the whole tree. + */ +#define perf_event_groups_for_each(event, groups, node) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), node); event; \ + event = rb_entry_safe(rb_next(&event->node), \ + typeof(*event), node)) +/* + * Iterate event groups with cpu == key. + */ +#define perf_event_groups_for_each_cpu(event, key, groups, node) \ + for (event = perf_event_groups_first(groups, key); \ + event && event->cpu == key; \ + event = rb_entry_safe(rb_next(&event->node), \ + typeof(*event), node)) + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -1489,12 +1665,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. 
*/ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1688,7 +1860,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); /* * If event was in error state, then keep it @@ -1706,7 +1878,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; lockdep_assert_held(&event->ctx->lock); @@ -1727,22 +1898,23 @@ static void perf_group_detach(struct perf_event *event) goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!RB_EMPTY_NODE(&event->group_node)) { + list_del_init(&sibling->group_entry); + add_event_to_groups(sibling, event->ctx); + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -2186,6 +2358,22 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +static int +flexible_group_sched_in(struct perf_event *event, + struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int *can_add_hw) +{ + if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event)) + return 0; + + if (group_can_go_on(event, cpuctx, *can_add_hw)) + if (group_sched_in(event, cpuctx, ctx)) + *can_add_hw = 0; + + return 1; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2652,6 +2840,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { + int sw = -1, cpu = smp_processor_id(); int is_active = ctx->is_active; struct perf_event *event; @@ -2700,12 +2889,20 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) + perf_event_groups_for_each_cpu(event, cpu, + &ctx->pinned_groups, group_node) + group_sched_out(event, cpuctx, ctx); + perf_event_groups_for_each_cpu(event, sw, + &ctx->pinned_groups, group_node) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) + perf_event_groups_for_each_cpu(event, cpu, + &ctx->flexible_groups, group_node) + group_sched_out(event, cpuctx, ctx); + perf_event_groups_for_each_cpu(event, sw, + &ctx->flexible_groups, group_node) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -2996,23 +3193,28 @@ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { + int sw = -1, cpu = smp_processor_id(); struct perf_event *event; + int can_add_hw; + + perf_event_groups_for_each_cpu(event, sw, + &ctx->pinned_groups, group_node) { + can_add_hw = 1; + if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { 
+ if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, + PERF_EVENT_STATE_ERROR); + } + } - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; - - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + perf_event_groups_for_each_cpu(event, cpu, + &ctx->pinned_groups, group_node) { + can_add_hw = 1; + if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, + PERF_EVENT_STATE_ERROR); + } } } @@ -3020,25 +3222,19 @@ static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { + int sw = -1, cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + perf_event_groups_for_each_cpu(event, sw, + &ctx->flexible_groups, group_node) + flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); + + can_add_hw = 1; + perf_event_groups_for_each_cpu(event, cpu, + &ctx->flexible_groups, group_node) + flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } } static void @@ -3119,7 +3315,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); @@ -3356,8 +3552,12 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (!ctx->rotate_disable) { + int sw = -1, cpu = smp_processor_id(); + + perf_event_groups_rotate(&ctx->flexible_groups, sw); + perf_event_groups_rotate(&ctx->flexible_groups, cpu); + } } static int perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3715,8 +3915,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); } @@ -9561,6 +9761,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -11085,7 +11286,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -11101,7 +11302,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) -- cgit v1.2.3-71-gd317 From 161c85fab7875f306eee9655dee71068feeb14ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:27 +0100 Subject: perf/core: Cleanup the rb-tree code Trivial comment and code fixups.. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 84 +++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9fee3640f40..22165b009d73 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1461,9 +1461,9 @@ static enum event_type_t get_event_type(struct perf_event *event) } /* - * Helper function to initialize group leader event; + * Helper function to initialize event group nodes. */ -void init_event_group(struct perf_event *event) +static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; @@ -1471,7 +1471,7 @@ void init_event_group(struct perf_event *event) /* * Extract pinned or flexible groups from the context - * based on event attrs bits; + * based on event attrs bits. 
*/ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) @@ -1483,9 +1483,9 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Helper function to initializes perf event groups object; + * Helper function to initializes perf_event_group trees. */ -void perf_event_groups_init(struct perf_event_groups *groups) +static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; @@ -1493,35 +1493,34 @@ void perf_event_groups_init(struct perf_event_groups *groups) /* * Compare function for event groups; - * Implements complex key that first sorts by CPU and then by - * virtual index which provides ordering when rotating - * groups for the same CPU; + * + * Implements complex key that first sorts by CPU and then by virtual index + * which provides ordering when rotating groups for the same CPU. */ -int perf_event_groups_less(struct perf_event *left, struct perf_event *right) +static bool +perf_event_groups_less(struct perf_event *left, struct perf_event *right) { - if (left->cpu < right->cpu) { - return 1; - } else if (left->cpu > right->cpu) { - return 0; - } else { - if (left->group_index < right->group_index) { - return 1; - } else if(left->group_index > right->group_index) { - return 0; - } else { - return 0; - } - } + if (left->cpu < right->cpu) + return true; + if (left->cpu > right->cpu) + return false; + + if (left->group_index < right->group_index) + return true; + if (left->group_index > right->group_index) + return false; + + return false; } /* - * Insert a group into a tree using event->cpu as a key. If event->cpu node - * is already attached to the tree then the event is added to the attached - * group's group_list list. + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for + * key (see perf_event_groups_less). This places it last inside the CPU + * subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, - struct perf_event *event) + struct perf_event *event) { struct perf_event *node_event; struct rb_node *parent; @@ -1534,8 +1533,7 @@ perf_event_groups_insert(struct perf_event_groups *groups, while (*node) { parent = *node; - node_event = container_of(*node, - struct perf_event, group_node); + node_event = container_of(*node, struct perf_event, group_node); if (perf_event_groups_less(event, node_event)) node = &parent->rb_left; @@ -1548,8 +1546,7 @@ perf_event_groups_insert(struct perf_event_groups *groups, } /* - * Helper function to insert event into the pinned or - * flexible groups; + * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) @@ -1561,22 +1558,21 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Delete a group from a tree. If the group is directly attached to the tree - * it also detaches all groups on the group's group_list list. + * Delete a group from a tree. 
*/ static void perf_event_groups_delete(struct perf_event_groups *groups, - struct perf_event *event) + struct perf_event *event) { - if (!RB_EMPTY_NODE(&event->group_node) && - !RB_EMPTY_ROOT(&groups->tree)) - rb_erase(&event->group_node, &groups->tree); + WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || + RB_EMPTY_ROOT(&groups->tree)); + rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* - * Helper function to delete event from its groups; + * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) @@ -1588,7 +1584,7 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get a group by a cpu key from groups tree with the least group_index; + * Get the leftmost event in the @cpu subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu) @@ -1597,8 +1593,7 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu) struct rb_node *node = groups->tree.rb_node; while (node) { - node_event = container_of(node, - struct perf_event, group_node); + node_event = container_of(node, struct perf_event, group_node); if (cpu < node_event->cpu) { node = node->rb_left; @@ -1614,13 +1609,14 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu) } /* - * Find group list by a cpu key and rotate it. + * Rotate the @cpu subtree. + * + * Re-insert the leftmost event at the tail of the subtree. */ static void perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) { - struct perf_event *event = - perf_event_groups_first(groups, cpu); + struct perf_event *event = perf_event_groups_first(groups, cpu); if (event) { perf_event_groups_delete(groups, event); @@ -1629,7 +1625,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) } /* - * Iterate event groups thru the whole tree. + * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups, node) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ -- cgit v1.2.3-71-gd317 From 1cac7b1ae3579457200213303fc28ca13b75592f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:30 +0100 Subject: perf/core: Fix event schedule order Scheduling in events with cpu=-1 before events with cpu=# changes semantics and is undesirable in that it would prioritize these events. Given that groups->index is across all groups we actually have an inter-group ordering, meaning we can merge-sort two groups, which is just what we need to preserve semantics. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 157 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 108 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 22165b009d73..2d8c0208ca4a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1608,6 +1608,21 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu) return match; } +/* + * Like rb_entry_next_safe() for the @cpu subtree.
+ */ +static struct perf_event * +perf_event_groups_next(struct perf_event *event) +{ + struct perf_event *next; + + next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); + if (next && next->cpu == event->cpu) + return next; + + return NULL; +} + /* * Rotate the @cpu subtree. * @@ -2354,22 +2369,6 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -static int -flexible_group_sched_in(struct perf_event *event, - struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - int *can_add_hw) -{ - if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event)) - return 0; - - if (group_can_go_on(event, cpuctx, *can_add_hw)) - if (group_sched_in(event, cpuctx, ctx)) - *can_add_hw = 0; - - return 1; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -3185,52 +3184,112 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int visit_groups_merge(struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), void *data) { - int sw = -1, cpu = smp_processor_id(); - struct perf_event *event; - int can_add_hw; + struct perf_event **evt, *evt1, *evt2; + int ret; - perf_event_groups_for_each_cpu(event, sw, - &ctx->pinned_groups, group_node) { - can_add_hw = 1; - if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, - PERF_EVENT_STATE_ERROR); + evt1 = perf_event_groups_first(groups, -1); + evt2 = perf_event_groups_first(groups, cpu); + + while (evt1 || evt2) { + if (evt1 && evt2) { + if (evt1->group_index < evt2->group_index) + evt = &evt1; + else + evt = &evt2; + } else if (evt1) { + evt = &evt1; + } else { + evt = &evt2; } + + ret = func(*evt, data); + if (ret) + return ret; + + *evt = perf_event_groups_next(*evt); } - perf_event_groups_for_each_cpu(event, cpu, - &ctx->pinned_groups, group_node) { - can_add_hw = 1; - if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, - PERF_EVENT_STATE_ERROR); - } + return 0; +} + +struct sched_in_data { + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + int can_add_hw; +}; + +static int pinned_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) + group_sched_in(event, sid->cpuctx, sid->ctx); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. 
+ */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + return 0; +} + +static int flexible_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (group_sched_in(event, sid->cpuctx, sid->ctx)) + sid->can_add_hw = 0; } + + return 0; } static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) { - int sw = -1, cpu = smp_processor_id(); - struct perf_event *event; - int can_add_hw = 1; + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; - perf_event_groups_for_each_cpu(event, sw, - &ctx->flexible_groups, group_node) - flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); + visit_groups_merge(&ctx->pinned_groups, + smp_processor_id(), + pinned_sched_in, &sid); +} - can_add_hw = 1; - perf_event_groups_for_each_cpu(event, cpu, - &ctx->flexible_groups, group_node) - flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; + visit_groups_merge(&ctx->flexible_groups, + smp_processor_id(), + flexible_sched_in, &sid); } static void -- cgit v1.2.3-71-gd317 From 8343aae66167df6708128a778e750d48dbe31302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:33 +0100 Subject: perf/core: Remove perf_event::group_entry Now that all the grouping is done with RB trees, we no longer need group_entry and can replace the whole thing with sibling_list. 
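In rough outline, the unified scheme looks like this (a sketch assembled from the calls visible in the diff below, not a verbatim excerpt; update_sibling() is a hypothetical stand-in for whatever per-sibling work a caller does): the leader's sibling_list doubles as the head of the group, and each sibling links in through its own sibling_list node.

    struct perf_event *sibling;

    /* attach a new sibling: the leader's list_head is the group's head */
    list_add_tail(&event->sibling_list, &group_leader->sibling_list);

    /* every former group_entry walk becomes a sibling_list walk */
    list_for_each_entry(sibling, &leader->sibling_list, sibling_list)
            update_sibling(sibling);        /* hypothetical per-sibling work */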
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 2 +- arch/arm/mach-imx/mmdc.c | 2 +- arch/arm/mm/cache-l2x0-pmu.c | 2 +- arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/perf/core-fsl-emb.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/intel/uncore.c | 2 +- drivers/bus/arm-cci.c | 2 +- drivers/bus/arm-ccn.c | 2 +- drivers/perf/arm_dsu_pmu.c | 2 +- drivers/perf/arm_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pmu.c | 3 +-- drivers/perf/qcom_l2_pmu.c | 4 ++-- drivers/perf/qcom_l3_pmu.c | 2 +- drivers/perf/xgene_pmu.c | 2 +- include/linux/perf_event.h | 5 ----- kernel/events/core.c | 37 ++++++++++++++++---------------- 19 files changed, 36 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index a1f6bc7f1e4c..435864c24479 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -351,7 +351,7 @@ static int collect_events(struct perf_event *group, int max_count, evtype[n] = group->hw.event_base; current_idx[n++] = PMC_NO_INDEX; } - list_for_each_entry(pe, &group->sibling_list, group_entry) { + list_for_each_entry(pe, &group->sibling_list, sibling_list) { if (!is_software_event(pe) && pe->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) return -1; diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c index 5fb1d2254b5e..27a9ca20933e 100644 --- a/arch/arm/mach-imx/mmdc.c +++ b/arch/arm/mach-imx/mmdc.c @@ -269,7 +269,7 @@ static bool mmdc_pmu_group_is_valid(struct perf_event *event) return false; } - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (!mmdc_pmu_group_event_is_valid(sibling, pmu, &counter_mask)) return false; } diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c index 0a1e2280141f..3a89ea4c2b57 100644 --- a/arch/arm/mm/cache-l2x0-pmu.c +++ b/arch/arm/mm/cache-l2x0-pmu.c @@ -293,7 +293,7 @@ static bool l2x0_pmu_group_is_valid(struct perf_event *event) else if (!is_software_event(leader)) return false; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (sibling->pmu == pmu) num_hw++; else if (!is_software_event(sibling)) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 6668f67a61c3..46097ff3208b 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -711,7 +711,7 @@ static int validate_group(struct perf_event *event) if (mipsxx_pmu_alloc_counter(&fake_cpuc, &leader->hw) < 0) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (mipsxx_pmu_alloc_counter(&fake_cpuc, &sibling->hw) < 0) return -EINVAL; } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index f89bbd54ecec..7c1f66050433 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1426,7 +1426,7 @@ static int 
collect_events(struct perf_event *group, int max_count, flags[n] = group->hw.event_base; events[n++] = group->hw.config; } - list_for_each_entry(event, &group->sibling_list, group_entry) { + list_for_each_entry(event, &group->sibling_list, sibling_list) { if (event->pmu->task_ctx_nr == perf_hw_context && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c index 5d747b4cb8ee..94c2e63662c6 100644 --- a/arch/powerpc/perf/core-fsl-emb.c +++ b/arch/powerpc/perf/core-fsl-emb.c @@ -277,7 +277,7 @@ static int collect_events(struct perf_event *group, int max_count, ctrs[n] = group; n++; } - list_for_each_entry(event, &group->sibling_list, group_entry) { + list_for_each_entry(event, &group->sibling_list, sibling_list) { if (!is_software_event(event) && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 5c1f54758312..a0a86d369119 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1342,7 +1342,7 @@ static int collect_events(struct perf_event *group, int max_count, events[n] = group->hw.event_base; current_idx[n++] = PIC_NO_INDEX; } - list_for_each_entry(event, &group->sibling_list, group_entry) { + list_for_each_entry(event, &group->sibling_list, sibling_list) { if (!is_software_event(event) && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9c86e10f1196..77a4125b6b1f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -990,7 +990,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, group_entry) { + list_for_each_entry(event, &leader->sibling_list, sibling_list) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7874c980d569..9e374cd22ad2 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -354,7 +354,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, group_entry) { + list_for_each_entry(event, &leader->sibling_list, sibling_list) { if (!is_box_event(box, event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c index 5426c04fe24b..c98435bdb64f 100644 --- a/drivers/bus/arm-cci.c +++ b/drivers/bus/arm-cci.c @@ -1311,7 +1311,7 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c index b52332e52ca5..1c310a4be000 100644 --- a/drivers/bus/arm-ccn.c +++ b/drivers/bus/arm-ccn.c @@ -847,7 +847,7 @@ static int arm_ccn_pmu_event_init(struct perf_event *event) return -EINVAL; list_for_each_entry(sibling, &event->group_leader->sibling_list, - group_entry) + sibling_list) if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 38f2cc2a6c74..660680d78147 100644 --- 
a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -536,7 +536,7 @@ static bool dsu_pmu_validate_group(struct perf_event *event) memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask)); if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader)) return false; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling)) return false; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 0c2ed11c0603..628d7a7b9526 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -311,7 +311,7 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 7ed24b954422..e3356087fd76 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -82,8 +82,7 @@ static bool hisi_validate_event_group(struct perf_event *event) counters++; } - list_for_each_entry(sibling, &event->group_leader->sibling_list, - group_entry) { + list_for_each_entry(sibling, &event->group_leader->sibling_list, sibling_list) { if (is_software_event(sibling)) continue; if (sibling->pmu != event->pmu) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 4fdc8486a8e4..5e535a718965 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -535,7 +535,7 @@ static int l2_cache_event_init(struct perf_event *event) } list_for_each_entry(sibling, &event->group_leader->sibling_list, - group_entry) + sibling_list) if (sibling->pmu != event->pmu && !is_software_event(sibling)) { dev_dbg_ratelimited(&l2cache_pmu->pdev->dev, @@ -572,7 +572,7 @@ static int l2_cache_event_init(struct perf_event *event) } list_for_each_entry(sibling, &event->group_leader->sibling_list, - group_entry) { + sibling_list) { if ((sibling != event) && !is_software_event(sibling) && (L2_EVT_GROUP(sibling->attr.config) == diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index 7f6b62b29e9d..5dedf4b1a552 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -468,7 +468,7 @@ static bool qcom_l3_cache__validate_event_group(struct perf_event *event) counters = event_num_counters(event); counters += event_num_counters(leader); - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { if (is_software_event(sibling)) continue; if (sibling->pmu != event->pmu) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index eb23311bc70c..f1f4a56cab5e 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -950,7 +950,7 @@ static int xgene_perf_event_init(struct perf_event *event) return -EINVAL; list_for_each_entry(sibling, &event->group_leader->sibling_list, - group_entry) + sibling_list) if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6e3f854a34d8..84044ec21b31 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -549,14 +549,9 @@ struct perf_event { struct list_head event_entry; /* - * XXX: 
group_entry and sibling_list should be mutually exclusive; - * either you're a sibling on a group, or you're the group leader. - * Rework the code to always use the same list element. - * * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ - struct list_head group_entry; struct list_head sibling_list; /* * Node on the pinned or flexible tree located at the event context; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d8c0208ca4a..9a07bbe66451 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) + list_for_each_entry(sibling, &leader->sibling_list, sibling_list) perf_event_update_time(sibling); } @@ -1835,12 +1835,12 @@ static void perf_group_attach(struct perf_event *event) group_leader->group_caps &= event->event_caps; - list_add_tail(&event->group_entry, &group_leader->sibling_list); + list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + list_for_each_entry(pos, &group_leader->sibling_list, sibling_list) perf_event__header_size(pos); } @@ -1904,7 +1904,7 @@ static void perf_group_detach(struct perf_event *event) * If this is a sibling, remove it from its group. */ if (event->group_leader != event) { - list_del_init(&event->group_entry); + list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; } @@ -1914,7 +1914,7 @@ static void perf_group_detach(struct perf_event *event) * upgrade the siblings to singleton events by adding them * to whatever list we are on. 
*/ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { sibling->group_leader = sibling; @@ -1922,7 +1922,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_caps = event->group_caps; if (!RB_EMPTY_NODE(&event->group_node)) { - list_del_init(&sibling->group_entry); + list_del_init(&sibling->sibling_list); add_event_to_groups(sibling, event->ctx); } @@ -1932,7 +1932,7 @@ static void perf_group_detach(struct perf_event *event) out: perf_event__header_size(event->group_leader); - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + list_for_each_entry(tmp, &event->group_leader->sibling_list, sibling_list) perf_event__header_size(tmp); } @@ -1960,7 +1960,7 @@ static inline int pmu_filter_match(struct perf_event *event) if (!__pmu_filter_match(event)) return 0; - list_for_each_entry(child, &event->sibling_list, group_entry) { + list_for_each_entry(child, &event->sibling_list, sibling_list) { if (!__pmu_filter_match(child)) return 0; } @@ -2028,7 +2028,7 @@ group_sched_out(struct perf_event *group_event, /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) + list_for_each_entry(event, &group_event->sibling_list, sibling_list) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); @@ -2307,7 +2307,7 @@ group_sched_in(struct perf_event *group_event, /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + list_for_each_entry(event, &group_event->sibling_list, sibling_list) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; @@ -2323,7 +2323,7 @@ group_error: * partial group before returning: * The events up to the failed event are scheduled out normally. 
*/ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + list_for_each_entry(event, &group_event->sibling_list, sibling_list) { if (event == partial_group) break; @@ -3796,7 +3796,7 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, group_entry) { + list_for_each_entry(sub, &event->sibling_list, sibling_list) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -4642,7 +4642,7 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + list_for_each_entry(sub, &leader->sibling_list, sibling_list) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -4836,7 +4836,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, group_entry) + list_for_each_entry(sibling, &event->sibling_list, sibling_list) perf_event_for_each_child(sibling, func); } @@ -5995,7 +5995,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + list_for_each_entry(sub, &leader->sibling_list, sibling_list) { n = 0; if ((sub != event) && @@ -9813,7 +9813,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); init_event_group(event); @@ -10581,7 +10580,7 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + sibling_list) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -10603,7 +10602,7 @@ SYSCALL_DEFINE5(perf_event_open, * reachable through the group lists. */ list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + sibling_list) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); @@ -11242,7 +11241,7 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + list_for_each_entry(sub, &parent_event->sibling_list, sibling_list) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) -- cgit v1.2.3-71-gd317 From 6668128a9e25f7a11d25359e46df2541e6b43fc9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:38 +0100 Subject: perf/core: Optimize ctx_sched_out() When an event group contains more events than can be scheduled on the hardware, iterating the full event group for ctx_sched_out is a waste of time. Keep track of the events that got programmed on the hardware, such that we can iterate this smaller list in order to schedule them out. 
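The essence of the optimization, condensed from the hunks below (a sketch of the call sequence, not a new interface): a successful schedule-in appends the event to a per-context active list, and schedule-out then walks only that list instead of the full tree.

    /* schedule in: remember what actually made it onto the PMU */
    if (!group_sched_in(event, sid->cpuctx, sid->ctx))
            list_add_tail(&event->active_list, &sid->ctx->flexible_active);

    /* schedule out: iterate only the events that are programmed */
    list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
            group_sched_out(event, cpuctx, ctx);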
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 53 +++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 84044ec21b31..2bb200e1bbea 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -553,6 +553,7 @@ struct perf_event { * either sufficies for read. */ struct list_head sibling_list; + struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ @@ -718,6 +719,10 @@ struct perf_event_context { struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; + + struct list_head pinned_active; + struct list_head flexible_active; + int nr_events; int nr_active; int is_active; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9a07bbe66451..4d601c06074f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1647,14 +1647,6 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) typeof(*event), node); event; \ event = rb_entry_safe(rb_next(&event->node), \ typeof(*event), node)) -/* - * Iterate event groups with cpu == key. - */ -#define perf_event_groups_for_each_cpu(event, key, groups, node) \ - for (event = perf_event_groups_first(groups, key); \ - event && event->cpu == key; \ - event = rb_entry_safe(rb_next(&event->node), \ - typeof(*event), node)) /* * Add a event from the lists for its context. @@ -1889,8 +1881,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; + struct perf_event_context *ctx = event->ctx; - lockdep_assert_held(&event->ctx->lock); + lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. @@ -1924,6 +1917,13 @@ static void perf_group_detach(struct perf_event *event) if (!RB_EMPTY_NODE(&event->group_node)) { list_del_init(&sibling->sibling_list); add_event_to_groups(sibling, event->ctx); + + if (sibling->state == PERF_EVENT_STATE_ACTIVE) { + struct list_head *list = sibling->attr.pinned ? + &ctx->pinned_active : &ctx->flexible_active; + + list_add_tail(&sibling->active_list, list); + } } WARN_ON_ONCE(sibling->ctx != event->ctx); @@ -1988,6 +1988,13 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + /* + * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but + * we can schedule events _OUT_ individually through things like + * __perf_remove_from_context(). 
+ */ + list_del_init(&event->active_list); + perf_pmu_disable(event->pmu); event->pmu->del(event, 0); @@ -2835,9 +2842,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - int sw = -1, cpu = smp_processor_id(); + struct perf_event *event, *tmp; int is_active = ctx->is_active; - struct perf_event *event; lockdep_assert_held(&ctx->lock); @@ -2884,20 +2890,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - perf_event_groups_for_each_cpu(event, cpu, - &ctx->pinned_groups, group_node) - group_sched_out(event, cpuctx, ctx); - perf_event_groups_for_each_cpu(event, sw, - &ctx->pinned_groups, group_node) + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - perf_event_groups_for_each_cpu(event, cpu, - &ctx->flexible_groups, group_node) - group_sched_out(event, cpuctx, ctx); - perf_event_groups_for_each_cpu(event, sw, - &ctx->flexible_groups, group_node) + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -3231,8 +3229,10 @@ static int pinned_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) - group_sched_in(event, sid->cpuctx, sid->ctx); + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->pinned_active); + } /* * If this pinned group hasn't been scheduled, @@ -3255,7 +3255,9 @@ static int flexible_sched_in(struct perf_event *event, void *data) return 0; if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (group_sched_in(event, sid->cpuctx, sid->ctx)) + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->flexible_active); + else sid->can_add_hw = 0; } @@ -3973,6 +3975,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx) perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); + INIT_LIST_HEAD(&ctx->pinned_active); + INIT_LIST_HEAD(&ctx->flexible_active); atomic_set(&ctx->refcount, 1); } @@ -9815,6 +9819,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); -- cgit v1.2.3-71-gd317 From 6e6804d2fa0eff6520f3a2b48ff52bcb9dc25a9d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:41 +0100 Subject: perf/core: Simplify perf_event_groups_for_each() The last argument is, and always must be, the same.
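The simplification is the usual macro-hygiene move of folding away an argument that can only ever take one value; a self-contained illustration in plain C, deliberately unrelated to the kernel sources:

    #include <stdio.h>

    struct node { int val; struct node *next; };

    /* before: callers had to repeat the link member's name every time */
    #define walk3(pos, head, member) \
            for ((pos) = (head); (pos); (pos) = (pos)->member)

    /* after: only one member is ever valid, so bake it in */
    #define walk(pos, head) \
            for ((pos) = (head); (pos); (pos) = (pos)->next)

    int main(void)
    {
            struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
            struct node *pos;

            walk(pos, &a)
                    printf("%d\n", pos->val);
            return 0;
    }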
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d601c06074f..fc5dd072c194 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1642,11 +1642,11 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) /* * Iterate through the whole groups tree. */ -#define perf_event_groups_for_each(event, groups, node) \ - for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ - typeof(*event), node); event; \ - event = rb_entry_safe(rb_next(&event->node), \ - typeof(*event), node)) +#define perf_event_groups_for_each(event, groups) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), group_node); event; \ + event = rb_entry_safe(rb_next(&event->group_node), \ + typeof(*event), group_node)) /* * Add a event from the lists for its context. @@ -11345,7 +11345,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -11361,7 +11361,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) -- cgit v1.2.3-71-gd317 From 8703a7cfe148f73062c568e9a8549ce692104864 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:44 +0100 Subject: perf/core: Fix tree based event rotation Similar to how first programming cpu=-1 and then cpu=# is wrong, so is rotating both. It was especially wrong when we were still programming the PMU in this same order, because in that scenario we might never actually end up running cpu=# events at all. Cure this by using the active_list to pick the rotation event; since at programming we already select the left-most event. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fc5dd072c194..460e485220e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1623,22 +1623,6 @@ perf_event_groups_next(struct perf_event *event) return NULL; } -/* - * Rotate the @cpu subtree. 
- * - * Re-insert the leftmost event at the tail of the subtree. - */ -static void -perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) -{ - struct perf_event *event = perf_event_groups_first(groups, cpu); - - if (event) { - perf_event_groups_delete(groups, event); - perf_event_groups_insert(groups, event); - } -} - /* * Iterate through the whole groups tree. */ @@ -3601,24 +3585,24 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } /* - * Round-robin a context's events: + * Move @event to the tail of the @ctx's eligible events. */ -static void rotate_ctx(struct perf_event_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ - if (!ctx->rotate_disable) { - int sw = -1, cpu = smp_processor_id(); + if (ctx->rotate_disable) + return; - perf_event_groups_rotate(&ctx->flexible_groups, sw); - perf_event_groups_rotate(&ctx->flexible_groups, cpu); - } + perf_event_groups_delete(&ctx->flexible_groups, event); + perf_event_groups_insert(&ctx->flexible_groups, event); } static int perf_rotate_context(struct perf_cpu_context *cpuctx) { + struct perf_event *ctx_event = NULL, *cpuctx_event = NULL; struct perf_event_context *ctx = NULL; int rotate = 0; @@ -3639,13 +3623,21 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); + cpuctx_event = list_first_entry_or_null(&cpuctx->ctx.flexible_active, + struct perf_event, active_list); + if (ctx) { + ctx_event = list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); + } + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + if (cpuctx_event) + rotate_ctx(&cpuctx->ctx, cpuctx_event); + if (ctx_event) + rotate_ctx(ctx, ctx_event); perf_event_sched_in(cpuctx, ctx, current); -- cgit v1.2.3-71-gd317 From 8d5bce0c37fa10f21dbdd6a6d8fcba85202fe24e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Mar 2018 14:56:27 +0100 Subject: perf/core: Optimize perf_rotate_context() event scheduling The event schedule order (as per perf_event_sched_in()) is: - cpu pinned - task pinned - cpu flexible - task flexible But perf_rotate_context() will unschedule cpu-flexible even if it doesn't need a rotation.
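For reference, that order corresponds roughly to the following call sequence (a paraphrase of perf_event_sched_in(), with argument lists abbreviated; see the diff below for the actual code):

    cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);            /* 1. cpu pinned    */
    if (ctx)
            ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);   /* 2. task pinned   */
    cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);          /* 3. cpu flexible  */
    if (ctx)
            ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); /* 4. task flexible */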
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 60 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 460e485220e8..f98c0f88cc94 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -1041,7 +1041,7 @@ list_update_cgroup_event(struct perf_event *event, static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - int rotations = 0; + bool rotations; lockdep_assert_irqs_disabled(); @@ -3600,52 +3600,66 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) perf_event_groups_insert(&ctx->flexible_groups, event); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx) +static inline struct perf_event * +ctx_first_active(struct perf_event_context *ctx) { - struct perf_event *ctx_event = NULL, *cpuctx_event = NULL; + return list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); +} + +static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +{ + struct perf_event *cpu_event = NULL, *task_event = NULL; + bool cpu_rotate = false, task_rotate = false; struct perf_event_context *ctx = NULL; - int rotate = 0; + + /* + * Since we run this from IRQ context, nobody can install new + * events, thus the event count values are stable. + */ if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + cpu_rotate = true; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { if (ctx->nr_events != ctx->nr_active) - rotate = 1; + task_rotate = true; } - if (!rotate) - goto done; + if (!(cpu_rotate || task_rotate)) + return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpuctx_event = list_first_entry_or_null(&cpuctx->ctx.flexible_active, - struct perf_event, active_list); - if (ctx) { - ctx_event = list_first_entry_or_null(&ctx->flexible_active, - struct perf_event, active_list); - } + if (task_rotate) + task_event = ctx_first_active(ctx); + if (cpu_rotate) + cpu_event = ctx_first_active(&cpuctx->ctx); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) + /* + * As per the order given at ctx_resched() first 'pop' task flexible + * and then, if needed CPU flexible. 
+ */ + if (task_event || (ctx && cpu_event)) ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (cpu_event) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (cpuctx_event) - rotate_ctx(&cpuctx->ctx, cpuctx_event); - if (ctx_event) - rotate_ctx(ctx, ctx_event); + if (task_event) + rotate_ctx(ctx, task_event); + if (cpu_event) + rotate_ctx(&cpuctx->ctx, cpu_event); perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - return rotate; + return true; } void perf_event_task_tick(void) -- cgit v1.2.3-71-gd317 From 33801b94741d6c3be9713c10aa627477216c21e2 Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Tue, 6 Mar 2018 17:36:37 +0800 Subject: perf/core: Fix installing cgroup events on CPU There's two problems when installing cgroup events on CPUs: firstly list_update_cgroup_event() only tries to set cpuctx->cgrp for the first event, if that mismatches on @cgrp we'll not try again for later additions. Secondly, when we install a cgroup event into an active context, only issue an event reprogram when the event matches the current cgroup context. This avoids a pointless event reprogramming. Signed-off-by: leilei.lin [ Improved the changelog and comments. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: brendan.d.gregg@gmail.com Cc: eranian@gmail.com Cc: linux-kernel@vger.kernel.org Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/20180306093637.28247-1-linxiulei@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f98c0f88cc94..969f865f9f1c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -937,27 +937,39 @@ list_update_cgroup_event(struct perf_event *event, if (!is_cgroup_event(event)) return; - if (add && ctx->nr_cgroups++) - return; - else if (!add && --ctx->nr_cgroups) - return; /* * Because cgroup events are always per-cpu events, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ - if (add) { + + /* + * Since setting cpuctx->cgrp is conditional on the current @cgrp + * matching the event's cgroup, we must do this for every new event, + * because if the first would mismatch, the second would not try again + * and we would leave cpuctx->cgrp unset. 
+ */ + if (add && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; - } else { - list_del(cpuctx_entry); - cpuctx->cgrp = NULL; } + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + + /* no cgroup running */ + if (!add) + cpuctx->cgrp = NULL; + + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + if (add) + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + else + list_del(cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -2489,6 +2501,18 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&task_ctx->lock); } +#ifdef CONFIG_CGROUP_PERF + if (is_cgroup_event(event)) { + /* + * If the current cgroup doesn't match the event's + * cgroup, we should not try to schedule it. + */ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + reprogram = cgroup_is_descendant(cgrp->css.cgroup, + event->cgrp->css.cgroup); + } +#endif + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); -- cgit v1.2.3-71-gd317 From b6b76dd62c56f257cdd30900ed22668c42a74030 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 12 Mar 2018 19:00:49 +0900 Subject: error-injection: Fix to prohibit jump optimization Since the kprobe which was optimized by jump can not change the execution path, the kprobe for error-injection must not be optimized. To prohibit it, set a dummy post-handler as officially stated in Documentation/kprobes.txt. Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Signed-off-by: Masami Hiramatsu Signed-off-by: Daniel Borkmann --- kernel/fail_function.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 21b0122cb39c..1d5632d8bbcc 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -14,6 +14,15 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); +static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs, + unsigned long flags) +{ + /* + * A dummy post handler is required to prohibit optimizing, because + * jump optimization does not support execution path overriding. + */ +} + struct fei_attr { struct list_head list; struct kprobe kp; @@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) return NULL; } attr->kp.pre_handler = fei_kprobe_handler; + attr->kp.post_handler = fei_post_handler; attr->retval = adjust_error_retval(addr, 0); INIT_LIST_HEAD(&attr->list); } -- cgit v1.2.3-71-gd317 From cbd9d9f114f9e5931ba199f4450667656b892f32 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:41 +0100 Subject: hw_breakpoint: Pass bp_type directly as find_slot_idx() argument Pass bp_type directly as a find_slot_idx() argument, so we don't need to have whole event to get the breakpoint slot type. It will be used in following changes. 
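The decoupling shows up directly in the call sites: with the raw type as the parameter, a caller that has only an attr::bp_type value, and no struct perf_event at all, can still resolve the slot class. Sketch of the before/after, taken from the hunks below:

    /* before: the whole event was needed */
    type = find_slot_idx(bp);

    /* after: any bp_type value will do */
    type = find_slot_idx(bp->attr.bp_type);
    type = find_slot_idx(iter->attr.bp_type);  /* e.g. in task_bp_pinned() */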
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-2-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..395ca07965af 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -85,9 +85,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) return 1; } -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +static inline enum bp_type_idx find_slot_idx(u64 bp_type) { - if (bp->attr.bp_type & HW_BREAKPOINT_RW) + if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; @@ -122,7 +122,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.target == tsk && - find_slot_idx(iter) == type && + find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } @@ -292,7 +292,7 @@ static int __reserve_bp_slot(struct perf_event *bp) bp->attr.bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; - type = find_slot_idx(bp); + type = find_slot_idx(bp->attr.bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); @@ -329,7 +329,7 @@ static void __release_bp_slot(struct perf_event *bp) enum bp_type_idx type; int weight; - type = find_slot_idx(bp); + type = find_slot_idx(bp->attr.bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } -- cgit v1.2.3-71-gd317 From 1ad9ff7dea4c42bee43f98b7d7ce8037ee22e133 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:42 +0100 Subject: hw_breakpoint: Pass bp_type argument to __reserve_bp_slot|__release_bp_slot() Passing bp_type argument to __reserve_bp_slot() and __release_bp_slot() functions, so we can pass another bp_type than the one defined in bp->attr.bp_type. This will be handy in following change that fixes breakpoint slot counts during its modification. 
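Schematically, the change trades the implicit type for an explicit one (signatures as in the diff below), which is what lets a later patch release one type and reserve another on the same breakpoint:

    /* before: the type always came from bp->attr.bp_type */
    static int  __reserve_bp_slot(struct perf_event *bp);
    static void __release_bp_slot(struct perf_event *bp);

    /* after: the caller chooses which type to account */
    static int  __reserve_bp_slot(struct perf_event *bp, u64 bp_type);
    static void __release_bp_slot(struct perf_event *bp, u64 bp_type);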
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-3-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 395ca07965af..9b31fbcc3305 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -277,7 +277,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ -static int __reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; @@ -288,11 +288,11 @@ static int __reserve_bp_slot(struct perf_event *bp) return -ENOMEM; /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) + if (bp_type == HW_BREAKPOINT_EMPTY || + bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; - type = find_slot_idx(bp->attr.bp_type); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); @@ -317,19 +317,19 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - ret = __reserve_bp_slot(bp); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); return ret; } -static void __release_bp_slot(struct perf_event *bp) +static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; - type = find_slot_idx(bp->attr.bp_type); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } @@ -339,7 +339,7 @@ void release_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); } @@ -354,7 +354,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - return __reserve_bp_slot(bp); + return __reserve_bp_slot(bp, bp->attr.bp_type); } int dbg_release_bp_slot(struct perf_event *bp) @@ -362,7 +362,7 @@ int dbg_release_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); return 0; } -- cgit v1.2.3-71-gd317 From ea6a9d530c1779b9bd30944b803820c462f01eac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:43 +0100 Subject: hw_breakpoint: Add modify_bp_slot() function Add the modify_bp_slot() function to keep slot numbers correct when changing the breakpoint type. Using existing __release_bp_slot()/__reserve_bp_slot() call sequence to update the slot counts. 
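The heart of the new function is a release/reserve pair with a rollback that cannot fail, because the slot being re-reserved was freed one statement earlier; condensed from the diff below:

    __release_bp_slot(bp, old_type);               /* drop the old accounting */
    err = __reserve_bp_slot(bp, bp->attr.bp_type); /* claim the new type      */
    if (err) {
            /* must succeed: we just released this very slot */
            WARN_ON(__reserve_bp_slot(bp, old_type));
    }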
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-4-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 46 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9b31fbcc3305..776948beb4ac 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -44,6 +44,7 @@ #include #include #include +#include #include /* @@ -344,6 +345,38 @@ void release_bp_slot(struct perf_event *bp) mutex_unlock(&nr_bp_mutex); } +static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int err; + + __release_bp_slot(bp, old_type); + + err = __reserve_bp_slot(bp, bp->attr.bp_type); + if (err) { + /* + * Reserve the old_type slot back in case + * there's no space for the new type. + * + * This must succeed, because we just released + * the old_type slot in the __release_bp_slot + * call above. If not, something is broken. + */ + WARN_ON(__reserve_bp_slot(bp, old_type)); + } + + return err; +} + +static int modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + ret = __modify_bp_slot(bp, old_type); + mutex_unlock(&nr_bp_mutex); + return ret; +} + /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and @@ -435,6 +468,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att u64 old_addr = bp->attr.bp_addr; u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; + bool modify = attr->bp_type != old_type; int err = 0; /* @@ -452,12 +486,9 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att bp->attr.bp_type = attr->bp_type; bp->attr.bp_len = attr->bp_len; - if (attr->disabled) - goto end; - err = validate_hw_breakpoint(bp); - if (!err) - perf_event_enable(bp); + err = validate_hw_breakpoint(bp); + if (!err && modify) + err = modify_bp_slot(bp, old_type); if (err) { bp->attr.bp_addr = old_addr; @@ -469,9 +500,10 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att return err; } -end: - bp->attr.disabled = attr->disabled; + if (!attr->disabled) + perf_event_enable(bp); + bp->attr.disabled = attr->disabled; return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3-71-gd317 From 18ff57b220610a699947f20b156a8245ca7eee98 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:44 +0100 Subject: hw_breakpoint: Factor out __modify_user_hw_breakpoint() function Moving out all the functionality without the event disabling/enabling calls, because we want to call other disabling/enabling functions in a following change.
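After the split, the exported entry point reduces to a disable/modify/re-enable bracket around the new helper; roughly (a condensation of the diff below, eliding the IRQs-disabled special case handled in the real code):

    perf_event_disable(bp);                       /* quiesce the breakpoint */
    err = __modify_user_hw_breakpoint(bp, attr);  /* pure attribute swap    */
    if (err) {
            if (!bp->attr.disabled)
                    perf_event_enable(bp);        /* restore the old state  */
            return err;
    }
    if (!attr->disabled)
            perf_event_enable(bp);                /* arm with the new attrs */
    return 0;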
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-5-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 46 +++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 776948beb4ac..a556aba223da 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -456,6 +456,33 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + bool modify = attr->bp_type != old_type; + int err = 0; + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + err = validate_hw_breakpoint(bp); + if (!err && modify) + err = modify_bp_slot(bp, old_type); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + return err; + } + + bp->attr.disabled = attr->disabled; + return 0; +} + /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - bool modify = attr->bp_type != old_type; - int err = 0; + int err; /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it @@ -482,18 +505,9 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - err = validate_hw_breakpoint(bp); - if (!err && modify) - err = modify_bp_slot(bp, old_type); + err = __modify_user_hw_breakpoint(bp, attr); if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; if (!bp->attr.disabled) perf_event_enable(bp); @@ -502,8 +516,6 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (!attr->disabled) perf_event_enable(bp); - - bp->attr.disabled = attr->disabled; return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3-71-gd317 From 705feaf321c37e4dca3637fd5cb3b275f17a06c9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:45 +0100 Subject: hw_breakpoint: Add perf_event_attr fields check in __modify_user_hw_breakpoint() And rename it to modify_user_hw_breakpoint_check(). We are about to use modify_user_hw_breakpoint_check() for user space breakpoint modification, so we must be very strict and check that only the fields we allow to change have changed.
As Peter explained: "Suppose someone does: attr = malloc(sizeof(*attr)); // uninitialized memory attr->type = BP; attr->bp_addr = new_addr; attr->bp_type = bp_type; attr->bp_len = bp_len; ioctl(fd, PERF_IOC_MOD_ATTR, &attr); And feeds absolute shite for the rest of the fields. Then we later want to extend IOC_MOD_ATTR to allow changing attr::sample_type but we can't, because that would break the above application." I'm making this check optional because we already export modify_user_hw_breakpoint() and with this check we could break existing users. Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-6-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a556aba223da..0c82663395f7 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -456,7 +456,9 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); -static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +static int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) { u64 old_addr = bp->attr.bp_addr; u64 old_len = bp->attr.bp_len; @@ -468,6 +470,9 @@ static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_ bp->attr.bp_type = attr->bp_type; bp->attr.bp_len = attr->bp_len; + if (check && memcmp(&bp->attr, attr, sizeof(*attr))) + return -EINVAL; + err = validate_hw_breakpoint(bp); if (!err && modify) err = modify_bp_slot(bp, old_type); @@ -505,7 +510,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - err = __modify_user_hw_breakpoint(bp, attr); + err = modify_user_hw_breakpoint_check(bp, attr, false); if (err) { if (!bp->attr.disabled) -- cgit v1.2.3-71-gd317 From 5f970521d3279d99adcdebf329631e36cb9f0deb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Mar 2018 14:45:46 +0100 Subject: perf/core: Move perf_event_attr::sample_max_stack into perf_copy_attr() Move the sample_max_stack check and setup into perf_copy_attr(), so we have all perf_event_attr initial setup in one place and can easily compare attrs in the new ioctl introduced in following change. 
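For user space the visible contract is unchanged; a hedged sketch of how the default is picked up (raw syscall, since glibc provides no perf_event_open() wrapper):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));      /* zero everything first */
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CPU_CLOCK;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_CALLCHAIN;
            /* Left at 0: perf_copy_attr() now substitutes the sysctl default
             * (kernel.perf_event_max_stack), same as the old open-path check. */
            attr.sample_max_stack = 0;

            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0) < 0;
    }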
Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Milind Chabbi Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-7-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 969f865f9f1c..ee145bdee6ed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10125,6 +10125,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (!attr->sample_max_stack) + attr->sample_max_stack = sysctl_perf_event_max_stack; + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); out: @@ -10338,9 +10341,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; - if (!attr.sample_max_stack) - attr.sample_max_stack = sysctl_perf_event_max_stack; - /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument -- cgit v1.2.3-71-gd317 From 72199320d49dbafa1a99f94f1cd60dc90035c154 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:32 +0100 Subject: timekeeping: Add the new CLOCK_MONOTONIC_ACTIVE clock The planned change to unify the behaviour of the MONOTONIC and BOOTTIME clocks vs. suspend removes the ability to retrieve the active non-suspended time of a system. Provide a new CLOCK_MONOTONIC_ACTIVE clock which returns the active non-suspended time of the system via clock_gettime(). This preserves the old behaviour of CLOCK_MONOTONIC before the BOOTTIME/MONOTONIC unification. This new clock also allows applications to detect programmatically that the MONOTONIC and BOOTTIME clocks are identical. 
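A hedged user-space sketch of the new clock; CLOCK_MONOTONIC_ACTIVE is defined by hand on the assumption that libc headers do not yet carry the new id:

    #include <stdio.h>
    #include <time.h>

    #ifndef CLOCK_MONOTONIC_ACTIVE
    #define CLOCK_MONOTONIC_ACTIVE 12    /* from the uapi change above */
    #endif

    int main(void)
    {
            struct timespec mono, active;

            clock_gettime(CLOCK_MONOTONIC, &mono);
            clock_gettime(CLOCK_MONOTONIC_ACTIVE, &active);

            /* On a kernel with unified MONOTONIC/BOOTTIME clocks, the
             * difference is (roughly) the accumulated suspend time. */
            printf("active: %ld s, monotonic: %ld s\n",
                   (long)active.tv_sec, (long)mono.tv_sec);
            return 0;
    }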
Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165149.965235774@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timekeeper_internal.h | 2 ++ include/linux/timekeeping.h | 1 + include/uapi/linux/time.h | 1 + kernel/time/posix-stubs.c | 2 ++ kernel/time/posix-timers.c | 13 +++++++++++++ kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 7acb953298a7..4b3dca173e89 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -52,6 +52,7 @@ struct tk_read_base { * @offs_real: Offset clock monotonic -> clock realtime * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai + * @time_suspended: Accumulated suspend time * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events @@ -94,6 +95,7 @@ struct timekeeper { ktime_t offs_real; ktime_t offs_boot; ktime_t offs_tai; + ktime_t time_suspended; s32 tai_offset; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b17bcce58bc4..440b1935d3a5 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -32,6 +32,7 @@ extern void getrawmonotonic64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern time64_t ktime_get_seconds(void); extern time64_t ktime_get_real_seconds(void); +extern void ktime_get_active_ts64(struct timespec64 *ts); extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 53f8dd84beb5..61a187df8da2 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -61,6 +61,7 @@ struct itimerval { */ #define CLOCK_SGI_CYCLE 10 #define CLOCK_TAI 11 +#define CLOCK_MONOTONIC_ACTIVE 12 #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..6259dbc0191a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -73,6 +73,8 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; + case CLOCK_MONOTONIC_ACTIVE: + ktime_get_active_ts64(tp); default: return -EINVAL; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..556fe02a47a4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -263,6 +263,13 @@ static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) return 0; } +static int posix_get_monotonic_active(clockid_t which_clock, + struct timespec64 *tp) +{ + ktime_get_active_ts64(tp); + return 0; +} + static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -1330,6 +1337,11 @@ static const struct k_clock clock_boottime = { .timer_arm = common_hrtimer_arm, }; +static const struct k_clock clock_monotonic_active = { + .clock_getres = posix_get_hrtimer_res, + 
.clock_get = posix_get_monotonic_active, +}; + static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME] = &clock_realtime, [CLOCK_MONOTONIC] = &clock_monotonic, @@ -1342,6 +1354,7 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, + [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e11760121cb2..a2b7f583e64e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -139,6 +139,9 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + + /* Accumulate time spent in suspend */ + tk->time_suspended += delta; } /* @@ -886,6 +889,39 @@ void ktime_get_ts64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts64); +/** + * ktime_get_active_ts64 - Get the active non-suspended monotonic clock + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime clock and + * the wall_to_monotonic offset, subtracts the accumulated suspend time and + * stores the result in normalized timespec64 format in the variable + * pointed to by @ts. + */ +void ktime_get_active_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono, tsusp; + u64 nsec, nssusp; + unsigned int seq; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + nssusp = tk->time_suspended; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); + tsusp = ns_to_timespec64(nssusp); + *ts = timespec64_sub(*ts, tsusp); +} + /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * -- cgit v1.2.3-71-gd317 From d6ed449afdb38f89a7b38ec50e367559e1b8f71f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:33 +0100 Subject: timekeeping: Make the MONOTONIC clock behave like the BOOTTIME clock The MONOTONIC clock is not fast forwarded by the time spent in suspend on resume. This is only done for the BOOTTIME clock. The reason why the MONOTONIC clock is not forwarded is historical: the original Linux implementation was using jiffies as a base for the MONOTONIC clock and jiffies have never been advanced after resume. At some point when timekeeping was unified in the core code, the MONOTONIC clock was advanced after resume, which also advanced jiffies, causing interesting side effects. As a consequence the MONOTONIC clock forwarding was disabled again and the BOOTTIME clock was introduced, which allows reading the time since boot. Back then it was not possible to completely disentangle the MONOTONIC clock and jiffies because there were still interfaces which exposed the MONOTONIC clock behaviour based on the timer wheel and therefore jiffies. As of today none of the MONOTONIC clock facilities depends on jiffies anymore, so the forwarding can be done separately. This is achieved by forwarding the variables which are used for the jiffies update after resume, before the tick is restarted. In timekeeping resume, the change is rather simple.
Instead of updating the offset between the MONOTONIC clock and the REALTIME/BOOTTIME clocks, advance the timekeeper base for the MONOTONIC and the MONOTONIC_RAW clocks by the time spent in suspend. The MONOTONIC clock is now the same as the BOOTTIME clock and the offset between the REALTIME and the MONOTONIC clocks is the same as before suspend. There might be side effects in applications that rely on the (unfortunately) well documented behaviour of the MONOTONIC clock, but the downsides of the existing behaviour are probably worse. There is one obvious issue. Up to now it was possible to retrieve the time spent in suspend by observing the delta between the MONOTONIC clock and the BOOTTIME clock. This is no longer available, but the previously introduced mechanism to read the active non-suspended monotonic time can mitigate that in a detectable fashion. Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165150.062975504@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 15 +++++++++++++++ kernel/time/tick-internal.h | 6 ++++++ kernel/time/tick-sched.c | 9 +++++++++ kernel/time/timekeeping.c | 7 ++++--- 4 files changed, 34 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 49edc1c4f3e6..099572ca4a8f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -419,6 +419,19 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } +static void tick_forward_next_period(void) +{ + ktime_t delta, now = ktime_get(); + u64 n; + + delta = ktime_sub(now, tick_next_period); + n = ktime_divns(delta, tick_period); + tick_next_period += n * tick_period; + if (tick_next_period < now) + tick_next_period += tick_period; + tick_sched_forward_next_period(); +} + /** * tick_resume_local - Resume the local tick device * @@ -431,6 +444,8 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); + tick_forward_next_period(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..21efab7485ca 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,6 +141,12 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +extern void tick_sched_forward_next_period(void); +#else +static inline void tick_sched_forward_next_period(void) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..f53e37b5d248 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -51,6 +51,15 @@ struct tick_sched *tick_get_tick_sched(int cpu) */ static ktime_t last_jiffies_update; +/* + * Called after resume. Make sure that jiffies are not fast forwarded due to + * clock monotonic being forwarded by the suspended time.
+ */ +void tick_sched_forward_next_period(void) +{ + last_jiffies_update = tick_next_period; +} + /* * Must be called with interrupts disabled ! */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a2b7f583e64e..b509fe7acd64 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,7 +138,9 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Update both bases so mono and raw stay coupled. */ + tk->tkr_mono.base += delta; + tk->tkr_raw.base += delta; /* Accumulate time spent in suspend */ tk->time_suspended += delta; @@ -1622,7 +1624,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -2155,7 +2156,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); *ts = ktime_to_timespec64(t); } -- cgit v1.2.3-71-gd317 From d6c7270e913db75ca5fdc79915ba780e97ae2857 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:35 +0100 Subject: timekeeping: Remove boot time specific code Now that the MONOTONIC and BOOTTIME clocks are the same, remove all the special handling from timekeeping. Keep wrappers for the existing users of the *boot* timekeeper interfaces. Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165150.236279497@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timekeeping.h | 42 +++++++++++++++++------------------------- kernel/time/timekeeping.c | 31 ------------------------------- 2 files changed, 17 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 440b1935d3a5..abb396731332 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -38,15 +38,19 @@ extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#define ktime_get_real_ts64(ts) getnstimeofday64(ts) +#define ktime_get_real_ts64(ts) getnstimeofday64(ts) + +/* Clock BOOTTIME compatibility wrappers */ +static inline void get_monotonic_boottime64(struct timespec64 *ts) +{ + ktime_get_ts64(ts); +} /* * ktime_t based interfaces */ - enum tk_offsets { TK_OFFS_REAL, - TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; @@ -57,6 +61,10 @@ extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); +/* Clock BOOTTIME compatibility wrappers */ +static inline ktime_t ktime_get_boottime(void) { return ktime_get(); } +static inline u64 ktime_get_boot_ns(void) { return ktime_get(); } + /** * ktime_get_real - get the real (wall-) time in ktime_t format */ @@ -65,17 +73,6 @@ static inline ktime_t ktime_get_real(void) return ktime_get_with_offset(TK_OFFS_REAL); } -/** - * ktime_get_boottime - Returns monotonic time since 
boot in ktime_t format - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the - * time spent in suspend. - */ -static inline ktime_t ktime_get_boottime(void) -{ - return ktime_get_with_offset(TK_OFFS_BOOT); -} - /** * ktime_get_clocktai - Returns the TAI time of day in ktime_t format */ @@ -102,11 +99,6 @@ static inline u64 ktime_get_real_ns(void) return ktime_to_ns(ktime_get_real()); } -static inline u64 ktime_get_boot_ns(void) -{ - return ktime_to_ns(ktime_get_boottime()); -} - static inline u64 ktime_get_tai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); @@ -119,17 +111,17 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); -extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); -/* - * timespec64 interfaces utilizing the ktime based ones - */ -static inline void get_monotonic_boottime64(struct timespec64 *ts) +/* Clock BOOTTIME compatibility wrappers */ +static inline u64 ktime_get_boot_fast_ns(void) { - *ts = ktime_to_timespec64(ktime_get_boottime()); + return ktime_get_mono_fast_ns(); } +/* + * timespec64 interfaces utilizing the ktime based ones + */ static inline void timekeeping_clocktai64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b509fe7acd64..8355c8803282 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -473,36 +473,6 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); -/** - * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. - * - * To keep it NMI safe since we're accessing from tracing, we're not using a - * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: - * - * (1) Its possible that a timestamp be taken after the boot offset is updated - * but before the timekeeper is updated. If this happens, the new boot offset - * is added to the old timekeeping making the clock appear to update slightly - * earlier: - * CPU 0 CPU 1 - * timekeeping_inject_sleeptime64() - * __timekeeping_inject_sleeptime(tk, delta); - * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); - * - * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be - * partially updated. Since the tk->offs_boot update is a rare event, this - * should be a rare occurrence which postprocessing should be able to handle. - */ -u64 notrace ktime_get_boot_fast_ns(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); -} -EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -794,7 +764,6 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, - [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; -- cgit v1.2.3-71-gd317 From 7250a4047aa6106006c2c9b5aff91d7d3fb77962 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:36 +0100 Subject: posix-timers: Unify MONOTONIC and BOOTTIME clock behavior Now that the MONOTONIC and BOOTTIME clocks are identical, remove all the special casing. The user space visible interfaces still support both clocks, but their behavior is identical.
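Detection from user space can be as simple as comparing the two clock ids; a hedged, second-granularity sketch (only meaningful once the machine has actually been suspended):

    #include <stdbool.h>
    #include <time.h>

    static bool boottime_equals_monotonic(void)
    {
            struct timespec mono, boot;

            clock_gettime(CLOCK_MONOTONIC, &mono);
            clock_gettime(CLOCK_BOOTTIME, &boot);
            /* On a unified kernel the readings differ only by the time
             * between the two syscalls, even after a suspend/resume. */
            return boot.tv_sec == mono.tv_sec;
    }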
Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165150.315745557@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/posix-timers.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 556fe02a47a4..8cf95bfee44f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -251,12 +251,6 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) -{ - get_monotonic_boottime64(tp); - return 0; -} - static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { timekeeping_clocktai64(tp); @@ -1322,21 +1316,6 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining = common_hrtimer_remaining, - .timer_try_to_cancel = common_hrtimer_try_to_cancel, - .timer_arm = common_hrtimer_arm, -}; - static const struct k_clock clock_monotonic_active = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_active, @@ -1350,7 +1329,7 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_BOOTTIME] = &clock_monotonic, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, -- cgit v1.2.3-71-gd317 From 127bfa5f4342e63d83a0b07ece376c2e8878e4a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:37 +0100 Subject: hrtimer: Unify MONOTONIC and BOOTTIME clock behavior Now that the MONOTONIC and BOOTTIME clocks are identical, remove all the special casing. The user space visible interfaces still support both clocks, but their behavior is identical.
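One user-visible consequence is that suspend time no longer counts toward an hrtimer-backed CLOCK_BOOTTIME timer, since it now sits on the MONOTONIC base. A hedged timerfd sketch:

    #include <stdint.h>
    #include <sys/timerfd.h>
    #include <unistd.h>

    int main(void)
    {
            struct itimerspec its = { .it_value = { .tv_sec = 60 } };
            uint64_t expirations;
            int fd = timerfd_create(CLOCK_BOOTTIME, 0);

            timerfd_settime(fd, 0, &its, NULL);
            /* With this patch, a suspend inside the 60s window does not
             * count toward the timer; before this patch it did. */
            read(fd, &expirations, sizeof(expirations));
            close(fd);
            return 0;
    }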
Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165150.410218515@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 2 -- kernel/time/hrtimer.c | 16 ++-------------- kernel/time/timekeeping.c | 4 +--- kernel/time/timekeeping.h | 1 - 4 files changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c7902ca7c9f4..78f456fcd242 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -161,11 +161,9 @@ struct hrtimer_clock_base { enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, - HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, - HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23788100e214..9b082ce86325 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,11 +90,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, @@ -110,11 +105,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, - { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, @@ -129,7 +119,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -565,14 +555,12 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + offs_real, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; - base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8355c8803282..ca90219a1e73 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2195,7 +2195,6 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2205,7 +2204,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t 
ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai) + ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2222,7 +2221,6 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..79b67f5e0343 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,7 +6,6 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); -- cgit v1.2.3-71-gd317 From 92af4dcb4e1c5f58dc337bc97bdffd4e853dbc93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Mar 2018 17:33:38 +0100 Subject: tracing: Unify the "boot" and "mono" tracing clocks Unify the "boot" and "mono" tracing clocks and document the new behaviour. Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20180301165150.489635255@linutronix.de Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 14 +++----------- include/linux/timekeeping.h | 6 ------ kernel/trace/trace.c | 2 +- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index d4601df6e72e..bf89f98bfdb9 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -449,17 +449,9 @@ of ftrace. Here is a list of some of the key files: which is montonic but is not subject to any rate adjustments and ticks at the same rate as the hardware clocksource. - boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the - fast monotonic clock, but also accounts for time spent in - suspend. Since the clock access is designed for use in - tracing in the suspend path, some side effects are possible - if clock is accessed after the suspend time is accounted before - the fast mono clock is updated. In this case, the clock update - appears to happen slightly sooner than it normally would have. - Also on 32-bit systems, it's possible that the 64-bit boot offset - sees a partial update. These effects are rare and post - processing should be able to handle them. See comments in the - ktime_get_boot_fast_ns() function for more information. + boot: Same as mono. Used to be a separate clock which accounted + for the time spent in suspend while CLOCK_MONOTONIC did + not. To set a clock, simply echo the clock name into this file. 
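For example, switching the trace clock from a program (a sketch; the usual debugfs mount point is assumed):

    #include <stdio.h>

    int main(void)
    {
            /* "boot" is still accepted, but now selects the same
             * ktime_get_mono_fast_ns() based clock as "mono". */
            FILE *f = fopen("/sys/kernel/debug/tracing/trace_clock", "w");

            if (!f)
                    return 1;
            fprintf(f, "boot");
            return fclose(f) != 0;
    }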
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index abb396731332..82c219dfd3bb 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -113,12 +113,6 @@ extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); -/* Clock BOOTTIME compatibility wrappers */ -static inline u64 ktime_get_boot_fast_ns(void) -{ - return ktime_get_mono_fast_ns(); -} - /* * timespec64 interfaces utilizing the ktime based ones */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20a2300ae4e8..300f4ea39646 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1164,7 +1164,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_mono_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3-71-gd317 From 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573 Mon Sep 17 00:00:00 2001 From: Milind Chabbi Date: Mon, 12 Mar 2018 14:45:47 +0100 Subject: perf/core: Implement fast breakpoint modification via _IOC_MODIFY_ATTRIBUTES Problem and motivation: Once a breakpoint perf event (PERF_TYPE_BREAKPOINT) is created, there is no flexibility to change the breakpoint type (bp_type), breakpoint address (bp_addr), or breakpoint length (bp_len). The only option is to close the perf event and configure a new breakpoint event. This inflexibility has a significant performance overhead. For example, sampling-based, lightweight performance profilers (and also concurrency bug detection tools), monitor different addresses for a short duration using PERF_TYPE_BREAKPOINT and change the address (bp_addr) to another address or change the kind of breakpoint (bp_type) from "write" to a "read" or vice-versa or change the length (bp_len) of the address being monitored. The cost of these modifications is prohibitive since it involves unmapping the circular buffer associated with the perf event, closing the perf event, opening another perf event and mmaping another circular buffer. Solution: The new ioctl flag for perf events, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, introduced in this patch takes a pointer to a struct perf_event_attr as an argument to update an old breakpoint event with new address, type, and size. This facility allows retaining a previous mmaped perf events ring buffer and avoids having to close and reopen another perf event. This patch supports only changing PERF_TYPE_BREAKPOINT event type; future implementations can extend this feature. The patch replicates some of its functionality of modify_user_hw_breakpoint() in kernel/events/hw_breakpoint.c. modify_user_hw_breakpoint cannot be called directly since perf_event_ctx_lock() is already held in _perf_ioctl(). Evidence: Experiments show that the baseline (not able to modify an already created breakpoint) costs an order of magnitude (~10x) more than the suggested optimization (having the ability to dynamically modifying a configured breakpoint via ioctl). When the breakpoints typically do not trap, the speedup due to the suggested optimization is ~10x; even when the breakpoints always trap, the speedup is ~4x due to the suggested optimization. Testing: tests posted at https://github.com/linux-contrib/perf_event_modify_bp demonstrate the performance significance of this patch. Tests also check the functional correctness of the patch. Signed-off-by: Milind Chabbi [ Using modify_user_hw_breakpoint_check function. 
] [ Reformatted PERF_EVENT_IOC_*, so the values are all in one column. ] Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-8-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/hw_breakpoint.h | 7 +++++ include/uapi/linux/perf_event.h | 23 +++++++++-------- kernel/events/core.c | 48 +++++++++++++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 2 +- tools/include/uapi/linux/perf_event.h | 23 +++++++++-------- 5 files changed, 80 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index cf045885a499..6058c3844a76 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -53,6 +53,9 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, /* FIXME: only change from the attr, and don't unregister */ extern int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); +extern int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check); /* * Kernel breakpoints are not associated with any particular thread. @@ -97,6 +100,10 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, static inline int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return -ENOSYS; } +static inline int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) { return -ENOSYS; } + static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 6f873503552d..912b85b52344 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -448,17 +448,18 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) -#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index
ee145bdee6ed..3b4c7792a6ac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2846,6 +2846,41 @@ int perf_event_refresh(struct perf_event *event, int refresh) } EXPORT_SYMBOL_GPL(perf_event_refresh); +static int perf_event_modify_breakpoint(struct perf_event *bp, + struct perf_event_attr *attr) +{ + int err; + + _perf_event_disable(bp); + + err = modify_user_hw_breakpoint_check(bp, attr, true); + if (err) { + if (!bp->attr.disabled) + _perf_event_enable(bp); + + return err; + } + + if (!attr->disabled) + _perf_event_enable(bp); + return 0; +} + +static int perf_event_modify_attr(struct perf_event *event, + struct perf_event_attr *attr) +{ + if (event->attr.type != attr->type) + return -EINVAL; + + switch (event->attr.type) { + case PERF_TYPE_BREAKPOINT: + return perf_event_modify_breakpoint(event, attr); + default: + /* Place holder for future additions. */ + return -EOPNOTSUPP; + } +} + static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) @@ -4952,6 +4987,8 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -5024,6 +5061,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); + + case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { + struct perf_event_attr new_attr; + int err = perf_copy_attr((struct perf_event_attr __user *)arg, + &new_attr); + + if (err) + return err; + + return perf_event_modify_attr(event, &new_attr); + } default: return -ENOTTY; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 0c82663395f7..6253d5519cd8 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -456,7 +456,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); -static int +int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 6f873503552d..912b85b52344 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -448,17 +448,18 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) -#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO 
('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, -- cgit v1.2.3-71-gd317 From 537f4146c53c95aac977852b371bafb9c6755ee1 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 6 Mar 2018 15:35:43 +0530 Subject: workqueue: use put_device() instead of kfree() Never directly free @dev after calling device_register(), even if it returned an error! Always use put_device() to give up the reference initialized in this function instead. Signed-off-by: Arvind Yadav Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bb9a519cbf50..ccd1080dd6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5337,7 +5337,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) ret = device_register(&wq_dev->dev); if (ret) { - kfree(wq_dev); + put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } -- cgit v1.2.3-71-gd317 From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 6 Mar 2018 19:34:42 -0800 Subject: workqueue: remove unused cancel_work() Found this by accident. There are no usages of bare cancel_work() in current kernel source. Signed-off-by: Stephen Hemminger Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bc0cda180c8b..0c3301421c57 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); -extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ccd1080dd6e7..6ec6ba65127b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ - return __cancel_work(work, false); -} - /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel -- cgit v1.2.3-71-gd317 From af1d830bf32b27b387b97c8b29dc09e306a9ff7f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Mar 2018 10:24:20 -0500 Subject: jump_label: Fix sparc64 warning The kbuild test robot reported the following warning on sparc64: kernel/jump_label.c: In function '__jump_label_update': kernel/jump_label.c:376:51: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] WARN_ONCE(1, "can't patch jump_label at %pS", (void *)entry->code); On sparc64, the jump_label entry->code field is of type u32, but pointers are 64-bit. Silence the warning by casting entry->code to an unsigned long before casting it to a pointer. This is also what the sparc jump label code does. 
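The double cast is the standard way to widen a narrow integer before turning it into a pointer; a standalone user-space sketch of the warning and the fix:

    #include <stdio.h>

    int main(void)
    {
            unsigned int code = 0x1000;   /* 32-bit, like sparc64's entry->code */

            /* printf("%p\n", (void *)code); would trigger
             * -Wint-to-pointer-cast on a 64-bit build. */
            printf("%p\n", (void *)(unsigned long)code);  /* widen first */
            return 0;
    }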
Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt") Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Jason Baron Cc: Borislav Petkov Cc: "David S . Miller" Link: https://lkml.kernel.org/r/c966fed42be6611254a62d46579ec7416548d572.1521041026.git.jpoimboe@redhat.com --- kernel/jump_label.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52a0a7af8640..e7214093dcd1 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -373,7 +373,8 @@ static void __jump_label_update(struct static_key *key, if (kernel_text_address(entry->code)) arch_jump_label_transform(entry, jump_label_type(entry)); else - WARN_ONCE(1, "can't patch jump_label at %pS", (void *)entry->code); + WARN_ONCE(1, "can't patch jump_label at %pS", + (void *)(unsigned long)entry->code); } } } -- cgit v1.2.3-71-gd317 From 17a2f1ced0280068897990b0dd25ce70555b8ac7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 21:50:05 +0800 Subject: cpu/hotplug: Merge cpuhp_bp_states and cpuhp_ap_states cpuhp_bp_states and cpuhp_ap_states have different sets of steps with no conflicting steps, so they can be merged. The original `[CPUHP_BRINGUP_CPU] = { },` is removed, because the new cpuhp_hp_states has a CPUHP_ONLINE index, which is larger than CPUHP_BRINGUP_CPU. Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/20171201135008.21633-1-jiangshanlai@gmail.com --- kernel/cpu.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 53f7dc65f9a3..a1860d42aacf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -124,8 +124,7 @@ struct cpuhp_step { }; static DEFINE_MUTEX(cpuhp_state_mutex); -static struct cpuhp_step cpuhp_bp_states[]; -static struct cpuhp_step cpuhp_ap_states[]; +static struct cpuhp_step cpuhp_hp_states[]; static bool cpuhp_is_ap_state(enum cpuhp_state state) { @@ -138,10 +137,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; + return cpuhp_hp_states + state; } /** @@ -1224,7 +1220,7 @@ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ -static struct cpuhp_step cpuhp_bp_states[] = { +static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, @@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - /* - * Handled on controll processor until the plugged processor manages - * this itself.
- */ - [CPUHP_TEARDOWN_CPU] = { - .name = "cpu:teardown", - .startup.single = NULL, - .teardown.single = takedown_cpu, - .cant_stop = true, - }, -#else - [CPUHP_BRINGUP_CPU] = { }, -#endif -}; - -/* Application processor state steps */ -static struct cpuhp_step cpuhp_ap_states[] = { -#ifdef CONFIG_SMP /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", @@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_ONLINE] = { .name = "ap:online", }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup.single = NULL, + .teardown.single = takedown_cpu, + .cant_stop = true, + }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", @@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) switch (state) { case CPUHP_AP_ONLINE_DYN: - step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; + step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: - step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; + step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: -- cgit v1.2.3-71-gd317 From e9baef0d86162add1205eb07bae08e9efc2f1ae0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 8 Mar 2018 15:32:50 -0500 Subject: tracing: Combine enum and arrays into single macro in filter code Instead of having a separate enum that is the index into another array, like a string array, make a single macro that combines them into a single list, and then the two can not get out of sync. This makes it easier to add and remove items. The macro trick is: #define DOGS \ C( JACK, "Jack Russell") \ C( ITALIAN, "Italian Greyhound") \ C( GERMAN, "German Shepherd") #undef C #define C(a, b) a enum { DOGS }; #undef C #define C(a, b) b static char dogs[] = { DOGS }; Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 112 ++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c3c6eee1e4df..a2ef393b3bb2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -33,22 +33,26 @@ "# Only events with the given fields will be affected.\n" \ "# If no events are modified, an error message will be displayed here" -enum filter_op_ids -{ - OP_OR, - OP_AND, - OP_GLOB, - OP_NE, - OP_EQ, - OP_LT, - OP_LE, - OP_GT, - OP_GE, - OP_BAND, - OP_NOT, - OP_NONE, - OP_OPEN_PAREN, -}; +#define OPS \ + C( OP_OR, "||", 1 ), \ + C( OP_AND, "&&", 2 ), \ + C( OP_GLOB, "~", 4 ), \ + C( OP_NE, "!=", 4 ), \ + C( OP_EQ, "==", 4 ), \ + C( OP_LT, "<", 5 ), \ + C( OP_LE, "<=", 5 ), \ + C( OP_GT, ">", 5 ), \ + C( OP_GE, ">=", 5 ), \ + C( OP_BAND, "&", 6 ), \ + C( OP_NOT, "!", 6 ), \ + C( OP_NONE, "OP_NONE", 0 ), \ + C( OP_OPEN_PAREN, "(", 0 ), \ + C( OP_MAX, NULL, 0 ) + +#undef C +#define C(a, b, c) a + +enum filter_op_ids { OPS }; struct filter_op { int id; @@ -56,56 +60,36 @@ struct filter_op { int precedence; }; -/* Order must be the same as enum filter_op_ids above */ -static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_GLOB, "~", 4 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - 
{ OP_BAND, "&", 6 }, - { OP_NOT, "!", 6 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, -}; +#undef C +#define C(a, b, c) { a, b, c } -enum { - FILT_ERR_NONE, - FILT_ERR_INVALID_OP, - FILT_ERR_UNBALANCED_PAREN, - FILT_ERR_TOO_MANY_OPERANDS, - FILT_ERR_OPERAND_TOO_LONG, - FILT_ERR_FIELD_NOT_FOUND, - FILT_ERR_ILLEGAL_FIELD_OP, - FILT_ERR_ILLEGAL_INTVAL, - FILT_ERR_BAD_SUBSYS_FILTER, - FILT_ERR_TOO_MANY_PREDS, - FILT_ERR_MISSING_FIELD, - FILT_ERR_INVALID_FILTER, - FILT_ERR_IP_FIELD_ONLY, - FILT_ERR_ILLEGAL_NOT_OP, -}; +static struct filter_op filter_ops[] = { OPS }; -static char *err_text[] = { - "No error", - "Invalid operator", - "Unbalanced parens", - "Too many operands", - "Operand too long", - "Field not found", - "Illegal operation for field type", - "Illegal integer value", - "Couldn't find or set field in one of a subsystem's events", - "Too many terms in predicate expression", - "Missing field name and/or value", - "Meaningless filter expression", - "Only 'ip' field is supported for function trace", - "Illegal use of '!'", -}; +#define ERRORS \ + C( NONE, "No error"), \ + C( INVALID_OP, "Invalid operator"), \ + C( UNBALANCED_PAREN, "Unbalanced parens"), \ + C( TOO_MANY_OPERANDS, "Too many operands"), \ + C( OPERAND_TOO_LONG, "Operand too long"), \ + C( FIELD_NOT_FOUND, "Field not found"), \ + C( ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ + C( ILLEGAL_INTVAL, "Illegal integer value"), \ + C( BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ + C( TOO_MANY_PREDS, "Too many terms in predicate expression"), \ + C( MISSING_FIELD, "Missing field name and/or value"), \ + C( INVALID_FILTER, "Meaningless filter expression"), \ + C( IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ + C( ILLEGAL_NOT_OP, "Illegal use of '!'"), + +#undef C +#define C(a, b) FILT_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static char *err_text[] = { ERRORS }; struct opstack_op { enum filter_op_ids op; -- cgit v1.2.3-71-gd317 From 478325f188657d0e503d1f88cdaf516c792352c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 8 Mar 2018 17:53:20 -0500 Subject: tracing: Clean up and document pred_funcs_##type creation and use The pred_funcs_##type arrays consist of five functions that are assigned based on the ops. The array must be in the same order of the ops each function represents. The PRED_FUNC_START macro denotes the op enum that starts the op that maps to the pred_funcs_##type arrays. This is all very subtle and prone to bugs if the code is changed. Add comments describing how PRED_FUNC_START and pred_funcs_##type array is used, and also a PRED_FUNC_MAX that is the maximum number of functions in the arrays. Clean up select_comparison_fn() that assigns the predicates to the pred_funcs_##type array function as well as add protection in case an op is passed in that does not map correctly to the array. 
Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 46 ++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a2ef393b3bb2..9d383f4383dc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -65,6 +65,13 @@ struct filter_op { static struct filter_op filter_ops[] = { OPS }; +/* + * pred functions are OP_LT, OP_LE, OP_GT, OP_GE, and OP_BAND + * pred_funcs_##type below must match the order of them above. + */ +#define PRED_FUNC_START OP_LT +#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) + #define ERRORS \ C( NONE, "No error"), \ C( INVALID_OP, "Invalid operator"), \ @@ -172,8 +179,6 @@ static const filter_pred_fn_t pred_funcs_##type[] = { \ filter_pred_BAND_##type, \ }; -#define PRED_FUNC_START OP_LT - #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ -946,39 +951,52 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; + int pred_func_index = -1; + + switch (op) { + case OP_EQ: + case OP_NE: + break; + default: + if (WARN_ON_ONCE(op < PRED_FUNC_START)) + return NULL; + pred_func_index = op - PRED_FUNC_START; + if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) + return NULL; + } switch (field_size) { case 8: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_64; else if (field_is_signed) - fn = pred_funcs_s64[op - PRED_FUNC_START]; + fn = pred_funcs_s64[pred_func_index]; else - fn = pred_funcs_u64[op - PRED_FUNC_START]; + fn = pred_funcs_u64[pred_func_index]; break; case 4: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_32; else if (field_is_signed) - fn = pred_funcs_s32[op - PRED_FUNC_START]; + fn = pred_funcs_s32[pred_func_index]; else - fn = pred_funcs_u32[op - PRED_FUNC_START]; + fn = pred_funcs_u32[pred_func_index]; break; case 2: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_16; else if (field_is_signed) - fn = pred_funcs_s16[op - PRED_FUNC_START]; + fn = pred_funcs_s16[pred_func_index]; else - fn = pred_funcs_u16[op - PRED_FUNC_START]; + fn = pred_funcs_u16[pred_func_index]; break; case 1: - if (op == OP_EQ || op == OP_NE) + if (pred_func_index < 0) fn = filter_pred_8; else if (field_is_signed) - fn = pred_funcs_s8[op - PRED_FUNC_START]; + fn = pred_funcs_s8[pred_func_index]; else - fn = pred_funcs_u8[op - PRED_FUNC_START]; + fn = pred_funcs_u8[pred_func_index]; break; } -- cgit v1.2.3-71-gd317 From 80765597bc587feae8dbc8ce97a0f32e12a6e625 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 9 Mar 2018 13:19:28 -0500 Subject: tracing: Rewrite filter logic to be simpler and faster Al Viro reviewed the filter logic of ftrace trace events and found it to be very troubling. It creates a binary tree based on the logic operators and walks it during tracing. He sent me and Tom Zanussi a long explanation (and formal proof) of how to do the string parsing better and end up with a program array that can be simply iterated to come up with the correct results. I took his ideas and his pseudo code and rewrote the filter logic based on them. In doing so, I was able to remove a lot of code, and have a much more condensed filter logic in the process.
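To make "simply iterated" concrete, here is a condensed restatement of the filter_match_preds() loop this patch introduces further down. The stand-in type definitions are simplified; the loop body is the same as in the diff below:

	/* Minimal stand-ins; the real structures live in the patch below */
	struct filter_pred;
	typedef int (*filter_pred_fn_t)(struct filter_pred *pred, void *event);

	struct filter_pred {
		filter_pred_fn_t fn;	/* the compiled predicate */
	};

	struct prog_entry {
		int			target;		/* index to jump to on a taken branch */
		int			when_to_branch;	/* branch when the result equals this */
		struct filter_pred	*pred;		/* NULL for the final TRUE/FALSE entries */
	};

	/* Walk the program; the entry we stop on encodes the verdict */
	static int run_prog(struct prog_entry *prog, void *rec)
	{
		int i;

		for (i = 0; prog[i].pred; i++) {
			struct filter_pred *pred = prog[i].pred;
			int match = pred->fn(pred, rec);

			if (match == prog[i].when_to_branch)
				i = prog[i].target;
		}
		/* Landed on the TRUE or FALSE entry; its target is 1 or 0 */
		return prog[i].target;
	}

No tree walking, no recursion, no stack at trace time: one forward scan with short-circuit jumps.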
I wrote a very long comment describing the methodology that Al proposed in my own words. For more info on how this works, read the comment above predicate_parse(). Suggested-by: Al Viro Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 15 +- kernel/trace/trace_events_filter.c | 2311 ++++++++++++++++-------------------- 2 files changed, 1054 insertions(+), 1272 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9de3e2a2f042..6fb46a06c9dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1216,12 +1216,11 @@ struct ftrace_event_field { int is_signed; }; +struct prog_entry; + struct event_filter { - int n_preds; /* Number assigned */ - int a_preds; /* allocated */ - struct filter_pred __rcu *preds; - struct filter_pred __rcu *root; - char *filter_string; + struct prog_entry __rcu *prog; + char *filter_string; }; struct event_subsystem { @@ -1413,12 +1412,8 @@ struct filter_pred { unsigned short *ops; struct ftrace_event_field *field; int offset; - int not; + int not; int op; - unsigned short index; - unsigned short parent; - unsigned short left; - unsigned short right; }; static inline bool is_string_field(struct ftrace_event_field *field) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9d383f4383dc..703a416aa5c2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -33,60 +33,52 @@ "# Only events with the given fields will be affected.\n" \ "# If no events are modified, an error message will be displayed here" +/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */ #define OPS \ - C( OP_OR, "||", 1 ), \ - C( OP_AND, "&&", 2 ), \ - C( OP_GLOB, "~", 4 ), \ - C( OP_NE, "!=", 4 ), \ - C( OP_EQ, "==", 4 ), \ - C( OP_LT, "<", 5 ), \ - C( OP_LE, "<=", 5 ), \ - C( OP_GT, ">", 5 ), \ - C( OP_GE, ">=", 5 ), \ - C( OP_BAND, "&", 6 ), \ - C( OP_NOT, "!", 6 ), \ - C( OP_NONE, "OP_NONE", 0 ), \ - C( OP_OPEN_PAREN, "(", 0 ), \ - C( OP_MAX, NULL, 0 ) + C( OP_GLOB, "~" ), \ + C( OP_NE, "!=" ), \ + C( OP_EQ, "==" ), \ + C( OP_LE, "<=" ), \ + C( OP_LT, "<" ), \ + C( OP_GE, ">=" ), \ + C( OP_GT, ">" ), \ + C( OP_BAND, "&" ), \ + C( OP_MAX, NULL ) #undef C -#define C(a, b, c) a +#define C(a, b) a enum filter_op_ids { OPS }; -struct filter_op { - int id; - char *string; - int precedence; -}; - #undef C -#define C(a, b, c) { a, b, c } +#define C(a, b) b -static struct filter_op filter_ops[] = { OPS }; +static const char * ops[] = { OPS }; /* - * pred functions are OP_LT, OP_LE, OP_GT, OP_GE, and OP_BAND + * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above.
 */ -#define PRED_FUNC_START OP_LT +#define PRED_FUNC_START OP_LE #define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) #define ERRORS \ - C( NONE, "No error"), \ - C( INVALID_OP, "Invalid operator"), \ - C( UNBALANCED_PAREN, "Unbalanced parens"), \ - C( TOO_MANY_OPERANDS, "Too many operands"), \ - C( OPERAND_TOO_LONG, "Operand too long"), \ - C( FIELD_NOT_FOUND, "Field not found"), \ - C( ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ - C( ILLEGAL_INTVAL, "Illegal integer value"), \ - C( BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ - C( TOO_MANY_PREDS, "Too many terms in predicate expression"), \ - C( MISSING_FIELD, "Missing field name and/or value"), \ - C( INVALID_FILTER, "Meaningless filter expression"), \ - C( IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ - C( ILLEGAL_NOT_OP, "Illegal use of '!'"), + C(NONE, "No error"), \ + C(INVALID_OP, "Invalid operator"), \ + C(TOO_MANY_OPEN, "Too many '('"), \ + C(TOO_MANY_CLOSE, "Too few '('"), \ + C(MISSING_QUOTE, "Missing matching quote"), \ + C(OPERAND_TOO_LONG, "Operand too long"), \ + C(EXPECT_STRING, "Expecting string field"), \ + C(EXPECT_DIGIT, "Expecting numeric field"), \ + C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ + C(FIELD_NOT_FOUND, "Field not found"), \ + C(ILLEGAL_INTVAL, "Illegal integer value"), \ + C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ + C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ + C(INVALID_FILTER, "Meaningless filter expression"), \ + C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), #undef C #define C(a, b) FILT_ERR_##a @@ -98,84 +90,535 @@ enum { ERRORS }; static char *err_text[] = { ERRORS }; -struct opstack_op { - enum filter_op_ids op; - struct list_head list; -}; +/* Called after a '!' character but "!=" and "!~" are not "not"s */ +static bool is_not(const char *str) +{ + switch (str[1]) { + case '=': + case '~': + return false; + } + return true; +} -struct postfix_elt { - enum filter_op_ids op; - char *operand; - struct list_head list; +/** + * prog_entry - a single entry in the filter program + * @target: Index to jump to on a branch (actually one minus the index) + * @when_to_branch: The value of the result of the predicate to do a branch + * @pred: The predicate to execute. + */ +struct prog_entry { + int target; + int when_to_branch; + struct filter_pred *pred; }; -struct filter_parse_state { - struct filter_op *ops; - struct list_head opstack; - struct list_head postfix; +/** + * update_preds - assign a program entry a label target + * @prog: The program array + * @N: The index of the current entry in @prog + * @invert: What to assign a program entry for its branch condition + * + * The program entry at @N has a target that points to the index of a program + * entry that can have its target and when_to_branch fields updated. + * Update the target field of the current program entry denoted by index @N + * to be that of the updated entry.
This will denote the entry to update if + * we are processing an "||" after an "&&". + */ +static void update_preds(struct prog_entry *prog, int N, int invert) +{ + int t, s; + + t = prog[N].target; + s = prog[t].target; + prog[t].when_to_branch = invert; + prog[t].target = N; + prog[N].target = s; +} + +struct filter_parse_error { int lasterr; int lasterr_pos; - - struct { - char *string; - unsigned int cnt; - unsigned int tail; - } infix; - - struct { - char string[MAX_FILTER_STR_VAL]; - int pos; - unsigned int tail; - } operand; }; -struct pred_stack { - struct filter_pred **preds; - int index; +static void parse_error(struct filter_parse_error *pe, int err, int pos) +{ + pe->lasterr = err; + pe->lasterr_pos = pos; +} + +typedef int (*parse_pred_fn)(const char *str, void *data, int pos, + struct filter_parse_error *pe, + struct filter_pred **pred); + +enum { + INVERT = 1, + PROCESS_AND = 2, + PROCESS_OR = 4, }; -/* If not of not match is equal to not of not, then it is a match */ +/* + * Without going into a formal proof, this explains the method that is used in + * parsing the logical expressions. + * + * For example, if we have: "a && !(!b || (c && g)) || d || e && !f" + * The first pass will convert it into the following program: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * To do this, we use a data structure to represent each of the above + * predicates and conditions that has: + * + * predicate, when_to_branch, invert, target + * + * The "predicate" will hold the function to determine the result "r". + * The "when_to_branch" denotes what "r" should be if a branch is to be taken: + * "&&" would contain "!r" or (0) and "||" would contain "r" or (1). + * The "invert" holds whether the value should be reversed before testing. + * The "target" contains the label "l#" to jump to. + * + * A stack is created to hold values when parentheses are used. + * + * To simplify the logic, the labels will start at 0 and not 1. + * + * The possible invert values are 1 and 0. The number of "!"s that are in scope + * before the predicate determines the invert value: if the number is odd, the + * invert value is 1, and 0 otherwise. This means the invert value only + * needs to be toggled when a new "!" is introduced compared to what is stored + * on the stack, where parentheses were used. + * + * The top of the stack and "invert" are initialized to zero. + * + * ** FIRST PASS ** + * + * #1 A loop through all the tokens is done: + * + * #2 If the token is an "(", the stack is pushed, and the current stack value + * gets the current invert value, and the loop continues to the next token. + * The top of the stack saves the "invert" value to keep track of what + * the current inversion is. As "!(a && !b || c)" would require all + * predicates being affected separately by the "!" before the parentheses. + * And that would end up being equivalent to "(!a || b) && !c". + * + * #3 If the token is an "!", the current "invert" value gets inverted, and + * the loop continues. Note, if the next token is a predicate, then + * this "invert" value is only valid for the current program entry, + * and does not affect other predicates later on. + * + * The only other acceptable token is the predicate string.
+ * + * #4 A new entry into the program is added saving: the predicate and the + * current value of "invert". The target is currently assigned to the + * previous program index (this will not be its final value). + * + * #5 We now enter another loop and look at the next token. The only valid + * tokens are ")", "&&", "||" or end of the input string "\0". + * + * #6 The invert variable is reset to the current value saved on the top of + * the stack. + * + * #7 The top of the stack holds not only the current invert value, but also + * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher + * precedence than "||". That is "a && b || c && d" is equivalent to + * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs + * to be processed. This is the case if an "&&" was the last token. If it was, + * then we call update_preds(). This takes the program, the current index in + * the program, and the current value of "invert". More will be described + * below about this function. + * + * #8 If the next token is "&&" then we set a flag in the top of the stack + * that denotes that "&&" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #9 Otherwise, if a "||" needs to be processed then update_preds() is called. + * This is called with the program, the current index in the program, but + * this time with an inverted value of "invert" (that is !invert). This is + * because the value taken will become the "when_to_branch" value of the + * program. + * Note, this is called when the next token is not an "&&". As stated before, + * "&&" takes higher precedence, and "||" should not be processed yet if the + * next logical operation is "&&". + * + * #10 If the next token is "||" then we set a flag in the top of the stack + * that denotes that "||" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #11 If this is the end of the input string "\0" then we break out of both + * loops. + * + * #12 Otherwise, the next token is ")", where we pop the stack and continue + * this inner loop. + * + * Now to discuss the update_preds() function, as that is key to the setting up + * of the program. Remember the "target" of the program is initialized to the + * previous index and not the "l" label. The target holds the index into the + * program that gets affected by the operand. Thus if we have something like + * "a || b && c", when we process "a" the target will be "-1" (undefined). + * When we process "b", its target is "0", which is the index of "a", as that's + * the predicate that is affected by "||". But because the next token after "b" + * is "&&" we don't call update_preds(). Instead continue to "c". As the + * next token after "c" is not "&&" but the end of input, we first process the + * "&&" by calling update_preds() for the "&&", then we process the "||" by + * calling update_preds() with the values for processing "||". + * + * What does that mean? What update_preds() does is to first save the "target" + * of the program entry indexed by the current program entry's "target" + * (remember the "target" is initialized to previous program entry), and then + * sets that "target" to the current index which represents the label "l#". + * That entry's "when_to_branch" is set to the value passed in (the "invert" + * or "!invert"). Then it sets the current program entry's target to the saved + * "target" value (the old value of the program that had its "target" updated + * to the label).
+ * + * Looking back at "a || b && c", we have the following steps: + * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target + * "||" - flag that we need to process "||"; continue outer loop + * "b" - prog[1] = { "b", X, 0 } + * "&&" - flag that we need to process "&&"; continue outer loop + * (Notice we did not process "||") + * "c" - prog[2] = { "c", X, 1 } + * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&" + * t = prog[2].target; // t = 1 + * s = prog[t].target; // s = 0 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 0; + * prog[2].target = s; + * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||" + * t = prog[2].target; // t = 0 + * s = prog[t].target; // s = -1 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 1; + * prog[2].target = s; + * + * #13 Which brings us to the final step of the first pass, which is to set + * the last program entry's when_to_branch and target, which will be + * when_to_branch = 0; target = N; (the label just after the last program + * entry processed above). + * + * If we denote "TRUE" to be the entry after the last program entry processed, + * and "FALSE" the program entry after that, we are now done with the first + * pass. + * + * Making the above "a || b && c" have a program of: + * prog[0] = { "a", 1, 2 } + * prog[1] = { "b", 0, 2 } + * prog[2] = { "c", 0, 3 } + * + * Which translates into: + * n0: r = a; l0: if (r) goto l2; + * n1: r = b; l1: if (!r) goto l2; + * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;" + * T: return TRUE; l3: + * F: return FALSE + * + * Although the program is correct after the first pass, it is + * inefficient. The simple sample of "a || b && c" could easily be + * converted into: + * n0: r = a; if (r) goto T + * n1: r = b; if (!r) goto F + * n2: r = c; if (!r) goto F + * T: return TRUE; + * F: return FALSE; + * + * The First Pass is over the input string. The next two passes are over + * the program itself. + * + * ** SECOND PASS ** + * + * Which brings us to the second pass. If a jump to a label has the + * same condition as that label, it can instead jump to its target. + * The original example of "a && !(!b || (c && g)) || d || e && !f" + * where the first pass gives us: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE; + * F: return FALSE + * + * We can see that "l3: if (r) goto l4;" jumps to l4, where we have + * "if (r) goto l5;", and "l5: if (r) goto T". We can optimize this by + * converting l3 and l4 to go directly to T. To accomplish this, we start + * from the last entry in the program and work our way back. If the target of + * the entry has the same "when_to_branch" then we can use that entry's target. + * Doing this, the above would end up as: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T; + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE + * F: return FALSE + * + * In that same pass, if the "when_to_branch" doesn't match, we can simply + * go to the program entry after the label.
That is, "l2: if (!r) goto l4;" + * where "l4: if (r) goto T;", then we can convert l2 to be: + * "l2: if (!r) goto n5;". + * + * This will have the second pass give us: + * n1: r=a; l1: if (!r) goto n5; + * n2: r=b; l2: if (!r) goto n5; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * Notice, all the "l#" labels are no longer used, and they can now + * be discarded. + * + * ** THIRD PASS ** + * + * For the third pass we deal with the inverts. As they simply just + * make the "when_to_branch" get inverted, a simple loop over the + * program to that does: "when_to_branch ^= invert;" will do the + * job, leaving us with: + * n1: r=a; if (!r) goto n5; + * n2: r=b; if (!r) goto n5; + * n3: r=c: if (!r) goto T; + * n4: r=g; if (!r) goto T; + * n5: r=d; if (r) goto T + * n6: r=e; if (!r) goto F; + * n7: r=f; if (r) goto F + * T: return TRUE + * F: return FALSE + * + * As "r = a; if (!r) goto n5;" is obviously the same as + * "if (!a) goto n5;" without doing anything we can interperate the + * program as: + * n1: if (!a) goto n5; + * n2: if (!b) goto n5; + * n3: if (!c) goto T; + * n4: if (!g) goto T; + * n5: if (d) goto T + * n6: if (!e) goto F; + * n7: if (f) goto F + * T: return TRUE + * F: return FALSE + * + * Since the inverts are discarded at the end, there's no reason to store + * them in the program array (and waste memory). A separate array to hold + * the inverts is used and freed at the end. + */ +static struct prog_entry * +predicate_parse(const char *str, int nr_parens, int nr_preds, + parse_pred_fn parse_pred, void *data, + struct filter_parse_error *pe) +{ + struct prog_entry *prog_stack; + struct prog_entry *prog; + const char *ptr = str; + char *inverts = NULL; + int *op_stack; + int *top; + int invert = 0; + int ret = -ENOMEM; + int len; + int N = 0; + int i; + + nr_preds += 2; /* For TRUE and FALSE */ + + op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL); + if (!op_stack) + return ERR_PTR(-ENOMEM); + prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL); + if (!prog_stack) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL); + if (!inverts) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + + top = op_stack; + prog = prog_stack; + *top = 0; + + /* First pass */ + while (*ptr) { /* #1 */ + const char *next = ptr++; + + if (isspace(*next)) + continue; + + switch (*next) { + case '(': /* #2 */ + if (top - op_stack > nr_parens) + return ERR_PTR(-EINVAL); + *(++top) = invert; + continue; + case '!': /* #3 */ + if (!is_not(next)) + break; + invert = !invert; + continue; + } + + if (N >= nr_preds) { + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); + goto out_free; + } + + inverts[N] = invert; /* #4 */ + prog[N].target = N-1; + + len = parse_pred(next, data, ptr - str, pe, &prog[N].pred); + if (len < 0) { + ret = len; + goto out_free; + } + ptr = next + len; + + N++; + + ret = -1; + while (1) { /* #5 */ + next = ptr++; + if (isspace(*next)) + continue; + + switch (*next) { + case ')': + case '\0': + break; + case '&': + case '|': + if (next[1] == next[0]) { + ptr++; + break; + } + default: + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, + next - str); + goto out_free; + } + + invert = *top & INVERT; + + if (*top & PROCESS_AND) { /* #7 */ + update_preds(prog, N - 1, invert); + *top &= ~PROCESS_AND; + } + if (*next == '&') 
{ /* #8 */ + *top |= PROCESS_AND; + break; + } + if (*top & PROCESS_OR) { /* #9 */ + update_preds(prog, N - 1, !invert); + *top &= ~PROCESS_OR; + } + if (*next == '|') { /* #10 */ + *top |= PROCESS_OR; + break; + } + if (!*next) /* #11 */ + goto out; + + if (top == op_stack) { + ret = -1; + /* Too few '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str); + goto out_free; + } + top--; /* #12 */ + } + } + out: + if (top != op_stack) { + /* Too many '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str); + goto out_free; + } + + prog[N].pred = NULL; /* #13 */ + prog[N].target = 1; /* TRUE */ + prog[N+1].pred = NULL; + prog[N+1].target = 0; /* FALSE */ + prog[N-1].target = N; + prog[N-1].when_to_branch = false; + + /* Second Pass */ + for (i = N-1 ; i--; ) { + int target = prog[i].target; + if (prog[i].when_to_branch == prog[target].when_to_branch) + prog[i].target = prog[target].target; + } + + /* Third Pass */ + for (i = 0; i < N; i++) { + invert = inverts[i] ^ prog[i].when_to_branch; + prog[i].when_to_branch = invert; + /* Make sure the program always moves forward */ + if (WARN_ON(prog[i].target <= i)) { + ret = -EINVAL; + goto out_free; + } + } + + return prog; +out_free: + kfree(op_stack); + kfree(prog_stack); + kfree(inverts); + return ERR_PTR(ret); +} + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr < val); \ - return !!match == !pred->not; \ + return *addr < val; \ } \ static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr <= val); \ - return !!match == !pred->not; \ + return *addr <= val; \ } \ static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr > val); \ - return !!match == !pred->not; \ + return *addr > val; \ } \ static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = (*addr >= val); \ - return !!match == !pred->not; \ + return *addr >= val; \ } \ static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = !!(*addr & val); \ - return match == !pred->not; \ + return !!(*addr & val); \ } \ static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LT_##type, \ filter_pred_LE_##type, \ - filter_pred_GT_##type, \ + filter_pred_LT_##type, \ filter_pred_GE_##type, \ + filter_pred_GT_##type, \ filter_pred_BAND_##type, \ }; @@ -261,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) static int filter_pred_cpu(struct filter_pred *pred, void *event) { int cpu, cmp; - int match = 0; cpu = raw_smp_processor_id(); cmp = pred->val; switch (pred->op) { case OP_EQ: - match = cpu == cmp; - break; + return cpu == cmp; + case OP_NE: + return cpu != cmp; case OP_LT: - match = cpu < cmp; - break; + return cpu < cmp; case OP_LE: - match = cpu <= cmp; - break; + return cpu <= cmp; case OP_GT: - match = cpu > cmp; - break; + return cpu > cmp; case OP_GE: - match = cpu >= cmp; - break; + return cpu >= cmp; default: - break; + return 0; } - - return !!match == !pred->not; } /* Filter predicate for COMM. 
*/ static int filter_pred_comm(struct filter_pred *pred, void *event) { - int cmp, match; + int cmp; cmp = pred->regex.match(current->comm, &pred->regex, - pred->regex.field_len); - match = cmp ^ pred->not; - - return match; + TASK_COMM_LEN); + return cmp ^ pred->not; } static int filter_pred_none(struct filter_pred *pred, void *event) @@ -355,6 +790,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) return 1; return 0; } + /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -415,10 +851,9 @@ static void filter_build_regex(struct filter_pred *pred) struct regex *r = &pred->regex; char *search; enum regex_type type = MATCH_FULL; - int not = 0; if (pred->op == OP_GLOB) { - type = filter_parse_regex(r->pattern, r->len, &search, ¬); + type = filter_parse_regex(r->pattern, r->len, &search, &pred->not); r->len = strlen(search); memmove(r->pattern, search, r->len+1); } @@ -440,210 +875,32 @@ static void filter_build_regex(struct filter_pred *pred) r->match = regex_match_glob; break; } - - pred->not ^= not; -} - -enum move_type { - MOVE_DOWN, - MOVE_UP_FROM_LEFT, - MOVE_UP_FROM_RIGHT -}; - -static struct filter_pred * -get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, - int index, enum move_type *move) -{ - if (pred->parent & FILTER_PRED_IS_RIGHT) - *move = MOVE_UP_FROM_RIGHT; - else - *move = MOVE_UP_FROM_LEFT; - pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; - - return pred; -} - -enum walk_return { - WALK_PRED_ABORT, - WALK_PRED_PARENT, - WALK_PRED_DEFAULT, -}; - -typedef int (*filter_pred_walkcb_t) (enum move_type move, - struct filter_pred *pred, - int *err, void *data); - -static int walk_pred_tree(struct filter_pred *preds, - struct filter_pred *root, - filter_pred_walkcb_t cb, void *data) -{ - struct filter_pred *pred = root; - enum move_type move = MOVE_DOWN; - int done = 0; - - if (!preds) - return -EINVAL; - - do { - int err = 0, ret; - - ret = cb(move, pred, &err, data); - if (ret == WALK_PRED_ABORT) - return err; - if (ret == WALK_PRED_PARENT) - goto get_parent; - - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - goto get_parent; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - get_parent: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, - &move); - continue; - } - done = 1; - } while (!done); - - /* We are fine. */ - return 0; -} - -/* - * A series of AND or ORs where found together. Instead of - * climbing up and down the tree branches, an array of the - * ops were made in order of checks. We can just move across - * the array and short circuit if needed. - */ -static int process_ops(struct filter_pred *preds, - struct filter_pred *op, void *rec) -{ - struct filter_pred *pred; - int match = 0; - int type; - int i; - - /* - * Micro-optimization: We set type to true if op - * is an OR and false otherwise (AND). 
Then we - * just need to test if the match is equal to - * the type, and if it is, we can short circuit the - * rest of the checks: - * - * if ((match && op->op == OP_OR) || - * (!match && op->op == OP_AND)) - * return match; - */ - type = op->op == OP_OR; - - for (i = 0; i < op->val; i++) { - pred = &preds[op->ops[i]]; - if (!WARN_ON_ONCE(!pred->fn)) - match = pred->fn(pred, rec); - if (!!match == type) - break; - } - /* If not of not match is equal to not of not, then it is a match */ - return !!match == !op->not; -} - -struct filter_match_preds_data { - struct filter_pred *preds; - int match; - void *rec; -}; - -static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_match_preds_data *d = data; - - *err = 0; - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) - return WALK_PRED_DEFAULT; - /* We can treat folded ops as a leaf node */ - d->match = process_ops(d->preds, pred, d->rec); - } else { - if (!WARN_ON_ONCE(!pred->fn)) - d->match = pred->fn(pred, d->rec); - } - - return WALK_PRED_PARENT; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!d->match == (pred->op == OP_OR)) - return WALK_PRED_PARENT; - break; - case MOVE_UP_FROM_RIGHT: - break; - } - - return WALK_PRED_DEFAULT; } /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - struct filter_pred *preds; - struct filter_pred *root; - struct filter_match_preds_data data = { - /* match is currently meaningless */ - .match = -1, - .rec = rec, - }; - int n_preds, ret; + struct prog_entry *prog; + int i; /* no filter is considered a match */ if (!filter) return 1; - n_preds = filter->n_preds; - if (!n_preds) + prog = rcu_dereference_sched(filter->prog); + if (!prog) return 1; - /* - * n_preds, root and filter->preds are protect with preemption disabled. 
- */ - root = rcu_dereference_sched(filter->root); - if (!root) - return 1; - - data.preds = preds = rcu_dereference_sched(filter->preds); - ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); - WARN_ON(ret); - return data.match; + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + int match = pred->fn(pred, rec); + if (match == prog[i].when_to_branch) + i = prog[i].target; + } + return prog[i].target; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void parse_error(struct filter_parse_state *ps, int err, int pos) -{ - ps->lasterr = err; - ps->lasterr_pos = pos; -} - static void remove_filter_string(struct event_filter *filter) { if (!filter) @@ -653,11 +910,11 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_state *ps, +static void append_filter_err(struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; - int pos = ps->lasterr_pos; + int pos = pe->lasterr_pos; char *buf; int len; @@ -671,11 +928,19 @@ static void append_filter_err(struct filter_parse_state *ps, len = strlen(filter->filter_string); if (pos > len) - len = pos; + pos = len; + + /* indexing is off by one */ + if (pos) + pos++; trace_seq_puts(s, filter->filter_string); - trace_seq_printf(s, "\n%*s", pos, "^"); - trace_seq_printf(s, "\nparse_error: %s\n", err_text[ps->lasterr]); + if (pe->lasterr > 0) { + trace_seq_printf(s, "\n%*s", pos, "^"); + trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + } else { + trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); if (buf) { @@ -715,108 +980,18 @@ void print_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); } -static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) -{ - stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); - if (!stack->preds) - return -ENOMEM; - stack->index = n_preds; - return 0; -} - -static void __free_pred_stack(struct pred_stack *stack) -{ - kfree(stack->preds); - stack->index = 0; -} - -static int __push_pred_stack(struct pred_stack *stack, - struct filter_pred *pred) -{ - int index = stack->index; - - if (WARN_ON(index == 0)) - return -ENOSPC; - - stack->preds[--index] = pred; - stack->index = index; - return 0; -} - -static struct filter_pred * -__pop_pred_stack(struct pred_stack *stack) -{ - struct filter_pred *pred; - int index = stack->index; - - pred = stack->preds[index++]; - if (!pred) - return NULL; - - stack->index = index; - return pred; -} - -static int filter_set_pred(struct event_filter *filter, - int idx, - struct pred_stack *stack, - struct filter_pred *src) -{ - struct filter_pred *dest = &filter->preds[idx]; - struct filter_pred *left; - struct filter_pred *right; - - *dest = *src; - dest->index = idx; - - if (dest->op == OP_OR || dest->op == OP_AND) { - right = __pop_pred_stack(stack); - left = __pop_pred_stack(stack); - if (!left || !right) - return -EINVAL; - /* - * If both children can be folded - * and they are the same op as this op or a leaf, - * then this op can be folded. 
- */ - if (left->index & FILTER_PRED_FOLD && - ((left->op == dest->op && !left->not) || - left->left == FILTER_PRED_INVALID) && - right->index & FILTER_PRED_FOLD && - ((right->op == dest->op && !right->not) || - right->left == FILTER_PRED_INVALID)) - dest->index |= FILTER_PRED_FOLD; - - dest->left = left->index & ~FILTER_PRED_FOLD; - dest->right = right->index & ~FILTER_PRED_FOLD; - left->parent = dest->index & ~FILTER_PRED_FOLD; - right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else { - /* - * Make dest->left invalid to be used as a quick - * way to know this is a leaf node. - */ - dest->left = FILTER_PRED_INVALID; - - /* All leafs allow folding the parent ops. */ - dest->index |= FILTER_PRED_FOLD; - } - - return __push_pred_stack(stack, dest); -} - -static void __free_preds(struct event_filter *filter) +static void free_prog(struct event_filter *filter) { + struct prog_entry *prog; int i; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - kfree(filter->preds[i].ops); - kfree(filter->preds); - filter->preds = NULL; - } - filter->a_preds = 0; - filter->n_preds = 0; + prog = rcu_access_pointer(filter->prog); + if (!prog) + return; + + for (i = 0; prog[i].pred; i++) + kfree(prog[i].pred); + kfree(prog); } static void filter_disable(struct trace_event_file *file) @@ -834,7 +1009,7 @@ static void __free_filter(struct event_filter *filter) if (!filter) return; - __free_preds(filter); + free_prog(filter); kfree(filter->filter_string); kfree(filter); } @@ -844,30 +1019,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -static int __alloc_preds(struct event_filter *filter, int n_preds) -{ - struct filter_pred *pred; - int i; - - if (filter->preds) - __free_preds(filter); - - filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - - filter->a_preds = n_preds; - filter->n_preds = 0; - - for (i = 0; i < n_preds; i++) { - pred = &filter->preds[i]; - pred->fn = filter_pred_none; - } - - return 0; -} - static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); @@ -898,812 +1049,466 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { - if (file->system != dir) - continue; - __free_subsystem_filter(file); - } -} - -static int filter_add_pred(struct filter_parse_state *ps, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack) -{ - int err; - - if (WARN_ON(filter->n_preds == filter->a_preds)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = filter_set_pred(filter, filter->n_preds, stack, pred); - if (err) - return err; - - filter->n_preds++; - - return 0; -} - -int filter_assign_type(const char *type) -{ - if (strstr(type, "__data_loc") && strstr(type, "char")) - return FILTER_DYN_STRING; - - if (strchr(type, '[') && strstr(type, "char")) - return FILTER_STATIC_STRING; - - return FILTER_OTHER; -} - -static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) -{ - if (is_string_field(field) && - (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return false; - if (!is_string_field(field) && op == OP_GLOB) - return false; - - return true; -} - -static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, - int field_size, int field_is_signed) -{ - filter_pred_fn_t fn = NULL; - int pred_func_index = -1; - - switch (op) { - case OP_EQ: - case OP_NE: - break; - default: - if 
(WARN_ON_ONCE(op < PRED_FUNC_START)) - return NULL; - pred_func_index = op - PRED_FUNC_START; - if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) - return NULL; - } - - switch (field_size) { - case 8: - if (pred_func_index < 0) - fn = filter_pred_64; - else if (field_is_signed) - fn = pred_funcs_s64[pred_func_index]; - else - fn = pred_funcs_u64[pred_func_index]; - break; - case 4: - if (pred_func_index < 0) - fn = filter_pred_32; - else if (field_is_signed) - fn = pred_funcs_s32[pred_func_index]; - else - fn = pred_funcs_u32[pred_func_index]; - break; - case 2: - if (pred_func_index < 0) - fn = filter_pred_16; - else if (field_is_signed) - fn = pred_funcs_s16[pred_func_index]; - else - fn = pred_funcs_u16[pred_func_index]; - break; - case 1: - if (pred_func_index < 0) - fn = filter_pred_8; - else if (field_is_signed) - fn = pred_funcs_s8[pred_func_index]; - else - fn = pred_funcs_u8[pred_func_index]; - break; - } - - return fn; -} - -static int init_pred(struct filter_parse_state *ps, - struct ftrace_event_field *field, - struct filter_pred *pred) - -{ - filter_pred_fn_t fn = filter_pred_none; - unsigned long long val; - int ret; - - pred->offset = field->offset; - - if (!is_legal_op(field, pred->op)) { - parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); - return -EINVAL; - } - - if (field->filter_type == FILTER_COMM) { - filter_build_regex(pred); - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (is_string_field(field)) { - filter_build_regex(pred); - - if (field->filter_type == FILTER_STATIC_STRING) { - fn = filter_pred_string; - pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; - else - fn = filter_pred_pchar; - } else if (is_function_field(field)) { - if (strcmp(field->name, "ip")) { - parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); - return -EINVAL; - } - } else { - if (field->is_signed) - ret = kstrtoll(pred->regex.pattern, 0, &val); - else - ret = kstrtoull(pred->regex.pattern, 0, &val); - if (ret) { - parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); - return -EINVAL; - } - pred->val = val; - - if (field->filter_type == FILTER_CPU) - fn = filter_pred_cpu; - else - fn = select_comparison_fn(pred->op, field->size, - field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - } - - if (pred->op == OP_NE) - pred->not ^= 1; - - pred->fn = fn; - return 0; -} - -static void parse_init(struct filter_parse_state *ps, - struct filter_op *ops, - char *infix_string) -{ - memset(ps, '\0', sizeof(*ps)); - - ps->infix.string = infix_string; - ps->infix.cnt = strlen(infix_string); - ps->ops = ops; - - INIT_LIST_HEAD(&ps->opstack); - INIT_LIST_HEAD(&ps->postfix); -} - -static char infix_next(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return 0; - - ps->infix.cnt--; - - return ps->infix.string[ps->infix.tail++]; -} - -static char infix_peek(struct filter_parse_state *ps) -{ - if (ps->infix.tail == strlen(ps->infix.string)) - return 0; - - return ps->infix.string[ps->infix.tail]; -} - -static void infix_advance(struct filter_parse_state *ps) -{ - if (!ps->infix.cnt) - return; - - ps->infix.cnt--; - ps->infix.tail++; -} - -static inline int is_precedence_lower(struct filter_parse_state *ps, - int a, int b) -{ - return ps->ops[a].precedence < ps->ops[b].precedence; -} - -static inline int is_op_char(struct filter_parse_state *ps, char c) -{ - int i; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (ps->ops[i].string[0] == c) - return 1; - } - - 
return 0; -} - -static int infix_get_op(struct filter_parse_state *ps, char firstc) -{ - char nextc = infix_peek(ps); - char opstr[3]; - int i; - - opstr[0] = firstc; - opstr[1] = nextc; - opstr[2] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) { - infix_advance(ps); - return ps->ops[i].id; - } - } - - opstr[1] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) - return ps->ops[i].id; - } - - return OP_NONE; -} - -static inline void clear_operand_string(struct filter_parse_state *ps) -{ - memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); - ps->operand.tail = 0; -} - -static inline int append_operand_char(struct filter_parse_state *ps, char c) -{ - if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) - return -EINVAL; - - ps->operand.string[ps->operand.tail++] = c; - - return 0; -} - -static int filter_opstack_push(struct filter_parse_state *ps, - enum filter_op_ids op) -{ - struct opstack_op *opstack_op; - - opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); - if (!opstack_op) - return -ENOMEM; - - opstack_op->op = op; - list_add(&opstack_op->list, &ps->opstack); - - return 0; -} - -static int filter_opstack_empty(struct filter_parse_state *ps) -{ - return list_empty(&ps->opstack); -} - -static int filter_opstack_top(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - - if (filter_opstack_empty(ps)) - return OP_NONE; - - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - - return opstack_op->op; -} - -static int filter_opstack_pop(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - enum filter_op_ids op; - - if (filter_opstack_empty(ps)) - return OP_NONE; - - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - op = opstack_op->op; - list_del(&opstack_op->list); - - kfree(opstack_op); - - return op; -} - -static void filter_opstack_clear(struct filter_parse_state *ps) -{ - while (!filter_opstack_empty(ps)) - filter_opstack_pop(ps); -} - -static char *curr_operand(struct filter_parse_state *ps) -{ - return ps->operand.string; -} - -static int postfix_append_operand(struct filter_parse_state *ps, char *operand) -{ - struct postfix_elt *elt; - - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; - - elt->op = OP_NONE; - elt->operand = kstrdup(operand, GFP_KERNEL); - if (!elt->operand) { - kfree(elt); - return -ENOMEM; - } - - list_add_tail(&elt->list, &ps->postfix); - - return 0; -} - -static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) -{ - struct postfix_elt *elt; - - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; - - elt->op = op; - elt->operand = NULL; - - list_add_tail(&elt->list, &ps->postfix); - - return 0; -} - -static void postfix_clear(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - - while (!list_empty(&ps->postfix)) { - elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - list_del(&elt->list); - kfree(elt->operand); - kfree(elt); - } -} - -static int filter_parse(struct filter_parse_state *ps) -{ - enum filter_op_ids op, top_op; - int in_string = 0; - char ch; - - while ((ch = infix_next(ps))) { - if (ch == '"') { - in_string ^= 1; - continue; - } - - if (in_string) - goto parse_operand; - - if (isspace(ch)) - continue; - - if (is_op_char(ps, ch)) { - op = infix_get_op(ps, ch); - if (op == OP_NONE) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - - if 
(strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_top(ps); - if (!is_precedence_lower(ps, top_op, op)) { - top_op = filter_opstack_pop(ps); - postfix_append_op(ps, top_op); - continue; - } - break; - } - - filter_opstack_push(ps, op); - continue; - } - - if (ch == '(') { - filter_opstack_push(ps, OP_OPEN_PAREN); - continue; - } - - if (ch == ')') { - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - top_op = filter_opstack_pop(ps); - while (top_op != OP_NONE) { - if (top_op == OP_OPEN_PAREN) - break; - postfix_append_op(ps, top_op); - top_op = filter_opstack_pop(ps); - } - if (top_op == OP_NONE) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; - } + if (file->system != dir) continue; - } -parse_operand: - if (append_operand_char(ps, ch)) { - parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); - return -EINVAL; - } + __free_subsystem_filter(file); } +} - if (strlen(curr_operand(ps))) - postfix_append_operand(ps, curr_operand(ps)); +int filter_assign_type(const char *type) +{ + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_pop(ps); - if (top_op == OP_NONE) - break; - if (top_op == OP_OPEN_PAREN) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; - } - postfix_append_op(ps, top_op); - } + if (strchr(type, '[') && strstr(type, "char")) + return FILTER_STATIC_STRING; - return 0; + return FILTER_OTHER; } -static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct trace_event_call *call, - enum filter_op_ids op, - char *operand1, char *operand2) +static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { - struct ftrace_event_field *field; - static struct filter_pred pred; - - memset(&pred, 0, sizeof(pred)); - pred.op = op; - - if (op == OP_AND || op == OP_OR) - return &pred; + filter_pred_fn_t fn = NULL; + int pred_func_index = -1; - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return NULL; + switch (op) { + case OP_EQ: + case OP_NE: + break; + default: + if (WARN_ON_ONCE(op < PRED_FUNC_START)) + return NULL; + pred_func_index = op - PRED_FUNC_START; + if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) + return NULL; } - field = trace_find_event_field(call, operand1); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return NULL; + switch (field_size) { + case 8: + if (pred_func_index < 0) + fn = filter_pred_64; + else if (field_is_signed) + fn = pred_funcs_s64[pred_func_index]; + else + fn = pred_funcs_u64[pred_func_index]; + break; + case 4: + if (pred_func_index < 0) + fn = filter_pred_32; + else if (field_is_signed) + fn = pred_funcs_s32[pred_func_index]; + else + fn = pred_funcs_u32[pred_func_index]; + break; + case 2: + if (pred_func_index < 0) + fn = filter_pred_16; + else if (field_is_signed) + fn = pred_funcs_s16[pred_func_index]; + else + fn = pred_funcs_u16[pred_func_index]; + break; + case 1: + if (pred_func_index < 0) + fn = filter_pred_8; + else if (field_is_signed) + fn = pred_funcs_s8[pred_func_index]; + else + fn = pred_funcs_u8[pred_func_index]; + break; } - strcpy(pred.regex.pattern, operand2); - pred.regex.len = strlen(pred.regex.pattern); - pred.field = field; - return init_pred(ps, field, &pred) ? 
NULL : &pred; +} -static int check_preds(struct filter_parse_state *ps) +/* Called when a predicate is encountered by predicate_parse() */ +static int parse_pred(const char *str, void *data, + int pos, struct filter_parse_error *pe, + struct filter_pred **pred_ptr) { - int n_normal_preds = 0, n_logical_preds = 0; - struct postfix_elt *elt; - int cnt = 0; + struct trace_event_call *call = data; + struct ftrace_event_field *field; + struct filter_pred *pred = NULL; + char num_buf[24]; /* Big enough to hold an address */ + char *field_name; + char q; + u64 val; + int len; + int ret; + int op; + int s; + int i = 0; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - cnt++; - continue; - } + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; - if (elt->op == OP_AND || elt->op == OP_OR) { - n_logical_preds++; - cnt--; - continue; - } - if (elt->op != OP_NOT) - cnt--; - n_normal_preds++; - /* all ops should have operands */ - if (cnt < 0) - break; - } + while (isalnum(str[i]) || str[i] == '_') + i++; + + len = i - s; + + if (!len) + return -1; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + + /* Make sure that the field exists */ - if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) { - parse_error(ps, FILT_ERR_INVALID_FILTER, 0); + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) { + parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i); return -EINVAL; } - return 0; -} + while (isspace(str[i])) + i++; -static int count_preds(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - int n_preds = 0; + /* Make sure this op is supported */ + for (op = 0; ops[op]; op++) { + /* This is why '<=' must come before '<' in ops[] */ + if (strncmp(str + i, ops[op], strlen(ops[op])) == 0) + break; + } - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) - continue; - n_preds++; + if (!ops[op]) { + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; } - return n_preds; -} + i += strlen(ops[op]); -struct check_pred_data { - int count; - int max; -}; + while (isspace(str[i])) + i++; -static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct check_pred_data *d = data; + s = i; - if (WARN_ON(d->count++ > d->max)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - return WALK_PRED_DEFAULT; -} + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; -/* - * The tree is walked at filtering of an event. If the tree is not correctly - * built, it may cause an infinite loop. Check here that the tree does - * indeed terminate. - */ -static int check_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - struct check_pred_data data = { + pred->field = field; + pred->offset = field->offset; + pred->op = op; + + if (ftrace_event_is_function(call)) { /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. + * Perf does things differently with function events. + * It only allows an "ip" field, and expects a string. + * But the string does not need to be surrounded by quotes. + * If it is a string, the assigned function is a nop + * (perf doesn't use it), and we grab everything.
*/ - .max = 3 * filter->n_preds, - .count = 0, - }; + if (strcmp(field->name, "ip") != 0) { + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } - return walk_pred_tree(filter->preds, root, - check_pred_tree_cb, &data); -} + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + pred->regex.pattern[len] = 0; + + /* This is either a string, or an integer */ + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the op is OK for strings */ + switch (op) { + case OP_NE: + pred->not = 1; + /* Fall through */ + case OP_GLOB: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } -static int count_leafs_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - int *count = data; + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i); + goto err_free; + } - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) - (*count)++; + for (i++; str[i]; i++) { + if (str[i] == q) + break; + } + if (!str[i]) { + parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i); + goto err_free; + } - return WALK_PRED_DEFAULT; -} + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } -static int count_leafs(struct filter_pred *preds, struct filter_pred *root) -{ - int count = 0, ret; + pred->regex.len = len; + strncpy(pred->regex.pattern, str + s, len); + pred->regex.pattern[len] = 0; - ret = walk_pred_tree(preds, root, count_leafs_cb, &count); - WARN_ON(ret); - return count; -} + filter_build_regex(pred); -struct fold_pred_data { - struct filter_pred *root; - int count; - int children; -}; + if (field->filter_type == FILTER_COMM) { + pred->fn = filter_pred_comm; -static int fold_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct fold_pred_data *d = data; - struct filter_pred *root = d->root; + } else if (field->filter_type == FILTER_STATIC_STRING) { + pred->fn = filter_pred_string; + pred->regex.field_len = field->size; - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (pred->left != FILTER_PRED_INVALID) - return WALK_PRED_DEFAULT; + } else if (field->filter_type == FILTER_DYN_STRING) + pred->fn = filter_pred_strloc; + else + pred->fn = filter_pred_pchar; + /* go past the last quote */ + i++; - if (WARN_ON(d->count == d->children)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } + } else if (isdigit(str[i])) { - pred->index &= ~FILTER_PRED_FOLD; - root->ops[d->count++] = pred->index; - return WALK_PRED_DEFAULT; -} + /* Make sure the field is not a string */ + if (is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i); + goto err_free; + } -static int fold_pred(struct filter_pred *preds, struct filter_pred *root) -{ - struct fold_pred_data data = { - .root = root, - .count = 0, - }; - int children; + if (op == OP_GLOB) { + 
parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } - /* No need to keep the fold flag */ - root->index &= ~FILTER_PRED_FOLD; + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; - /* If the root is a leaf then do nothing */ - if (root->left == FILTER_PRED_INVALID) - return 0; + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } - /* count the children */ - children = count_leafs(preds, &preds[root->left]); - children += count_leafs(preds, &preds[root->right]); + strncpy(num_buf, str + s, len); + num_buf[len] = 0; - root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); - if (!root->ops) - return -ENOMEM; + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num_buf, 0, &val); + else + ret = kstrtoull(num_buf, 0, &val); + if (ret) { + parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s); + goto err_free; + } - root->val = children; - data.children = children; - return walk_pred_tree(preds, root, fold_pred_cb, &data); -} + pred->val = val; -static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_pred *preds = data; + if (field->filter_type == FILTER_CPU) + pred->fn = filter_pred_cpu; + else { + pred->fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (pred->op == OP_NE) + pred->not = 1; + } - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (!(pred->index & FILTER_PRED_FOLD)) - return WALK_PRED_DEFAULT; + } else { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } - *err = fold_pred(preds, pred); - if (*err) - return WALK_PRED_ABORT; + *pred_ptr = pred; + return i; - /* eveyrhing below is folded, continue with parent */ - return WALK_PRED_PARENT; +err_free: + kfree(pred); + return -EINVAL; } +enum { + TOO_MANY_CLOSE = -1, + TOO_MANY_OPEN = -2, + MISSING_QUOTE = -3, +}; + /* - * To optimize the processing of the ops, if we have several "ors" or - * "ands" together, we can put them in an array and process them all - * together speeding up the filter logic. + * Read the filter string once to calculate the number of predicates + * as well as how deep the parentheses go. 
+ * + * Returns: + * 0 - everything is fine (err is undefined) + * -1 - too many ')' + * -2 - too many '(' + * -3 - No matching quote */ -static int fold_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, - filter->preds); -} - -static int replace_preds(struct trace_event_call *call, - struct event_filter *filter, - struct filter_parse_state *ps, - bool dry_run) -{ - char *operand1 = NULL, *operand2 = NULL; - struct filter_pred *pred; - struct filter_pred *root; - struct postfix_elt *elt; - struct pred_stack stack = { }; /* init to NULL */ - int err; - int n_preds = 0; - - n_preds = count_preds(ps); - if (n_preds >= MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = check_preds(ps); - if (err) - return err; +static int calc_stack(const char *str, int *parens, int *preds, int *err) +{ + bool is_pred = false; + int nr_preds = 0; + int open = 1; /* Count the expression as "(E)" */ + int last_quote = 0; + int max_open = 1; + int quote = 0; + int i; - if (!dry_run) { - err = __alloc_pred_stack(&stack, n_preds); - if (err) - return err; - err = __alloc_preds(filter, n_preds); - if (err) - goto fail; - } + *err = 0; - n_preds = 0; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - if (!operand1) - operand1 = elt->operand; - else if (!operand2) - operand2 = elt->operand; - else { - parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - err = -EINVAL; - goto fail; - } + for (i = 0; str[i]; i++) { + if (isspace(str[i])) + continue; + if (quote) { + if (str[i] == quote) + quote = 0; continue; } - if (elt->op == OP_NOT) { - if (!n_preds || operand1 || operand2) { - parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); - err = -EINVAL; - goto fail; + switch (str[i]) { + case '\'': + case '"': + quote = str[i]; + last_quote = i; + break; + case '|': + case '&': + if (str[i+1] != str[i]) + break; + is_pred = false; + continue; + case '(': + is_pred = false; + open++; + if (open > max_open) + max_open = open; + continue; + case ')': + is_pred = false; + if (open == 1) { + *err = i; + return TOO_MANY_CLOSE; } - if (!dry_run) - filter->preds[n_preds - 1].not ^= 1; + open--; continue; } - - if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - err = -ENOSPC; - goto fail; + if (!is_pred) { + nr_preds++; + is_pred = true; } + } - pred = create_pred(ps, call, elt->op, operand1, operand2); - if (!pred) { - err = -EINVAL; - goto fail; - } + if (quote) { + *err = last_quote; + return MISSING_QUOTE; + } - if (!dry_run) { - err = filter_add_pred(ps, filter, pred, &stack); - if (err) - goto fail; - } + if (open != 1) { + int level = open; - operand1 = operand2 = NULL; + /* find the bad open */ + for (i--; i; i--) { + if (quote) { + if (str[i] == quote) + quote = 0; + continue; + } + switch (str[i]) { + case '(': + if (level == open) { + *err = i; + return TOO_MANY_OPEN; + } + level--; + break; + case ')': + level++; + break; + case '\'': + case '"': + quote = str[i]; + break; + } + } + /* First character is the '(' with missing ')' */ + *err = 0; + return TOO_MANY_OPEN; } - if (!dry_run) { - /* We should have one item left on the stack */ - pred = __pop_pred_stack(&stack); - if (!pred) - return -EINVAL; - /* This item is where we start from in matching */ - root = pred; - /* Make sure the stack is empty */ - pred = __pop_pred_stack(&stack); - if (WARN_ON(pred)) { - err = -EINVAL; - filter->root = NULL; - goto fail; + /* Set the size of 
the required stacks */ + *parens = max_open; + *preds = nr_preds; + return 0; +} + +static int process_preds(struct trace_event_call *call, + const char *filter_string, + struct event_filter *filter, + struct filter_parse_error *pe) +{ + struct prog_entry *prog; + int nr_parens; + int nr_preds; + int index; + int ret; + + ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index); + if (ret < 0) { + switch (ret) { + case MISSING_QUOTE: + parse_error(pe, FILT_ERR_MISSING_QUOTE, index); + break; + case TOO_MANY_OPEN: + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index); + break; + default: + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index); } - err = check_pred_tree(filter, root); - if (err) - goto fail; - - /* Optimize the tree */ - err = fold_pred_tree(filter, root); - if (err) - goto fail; - - /* We don't set root until we know it works */ - barrier(); - filter->root = root; + return ret; } - err = 0; -fail: - __free_pred_stack(&stack); - return err; + if (!nr_preds) { + prog = NULL; + } else { + prog = predicate_parse(filter_string, nr_parens, nr_preds, + parse_pred, call, pe); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + rcu_assign_pointer(filter->prog, prog); + return 0; } static inline void event_set_filtered_flag(struct trace_event_file *file) @@ -1753,9 +1558,9 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct trace_subsystem_dir *dir, +static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, - struct filter_parse_state *ps, + struct filter_parse_error *pe, char *filter_string) { struct trace_event_file *file; @@ -1766,29 +1571,11 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, bool fail = true; int err; - list_for_each_entry(file, &tr->events, list) { - if (file->system != dir) - continue; - - /* - * Try to see if the filter can be applied - * (filter arg is ignored on dry_run) - */ - err = replace_preds(file->event_call, NULL, ps, true); - if (err) - event_set_no_set_filter_flag(file); - else - event_clear_no_set_filter_flag(file); - } - list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; - if (event_no_set_filter_flag(file)) - continue; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) goto fail_mem; @@ -1797,11 +1584,11 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, if (!filter->filter_string) goto fail_mem; - err = replace_preds(file->event_call, filter, ps, false); + err = process_preds(file->event_call, filter_string, filter, pe); if (err) { filter_disable(file); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(ps, filter); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(pe, filter); } else event_set_filtered_flag(file); @@ -1843,7 +1630,7 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, list_del(&filter_item->list); kfree(filter_item); } - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: kfree(filter); @@ -1859,16 +1646,16 @@ static int replace_system_preds(struct trace_subsystem_dir *dir, } static int create_filter_start(char *filter_string, bool set_str, - struct filter_parse_state **psp, + struct filter_parse_error **pse, struct event_filter **filterp) { struct event_filter *filter; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err = 0; - WARN_ON_ONCE(*psp || *filterp); + if (WARN_ON_ONCE(*pse || *filterp)) + return -EINVAL; - /* allocate 
everything, and if any fails, free all and fail */ filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter && set_str) { filter->filter_string = kstrdup(filter_string, GFP_KERNEL); @@ -1876,32 +1663,24 @@ static int create_filter_start(char *filter_string, bool set_str, err = -ENOMEM; } - ps = kzalloc(sizeof(*ps), GFP_KERNEL); + pe = kzalloc(sizeof(*pe), GFP_KERNEL); - if (!filter || !ps || err) { - kfree(ps); + if (!filter || !pe || err) { + kfree(pe); __free_filter(filter); return -ENOMEM; } /* we're committed to creating a new filter */ *filterp = filter; - *psp = ps; + *pse = pe; - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err && set_str) - append_filter_err(ps, filter); - return err; + return 0; } -static void create_filter_finish(struct filter_parse_state *ps) +static void create_filter_finish(struct filter_parse_error *pe) { - if (ps) { - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - } + kfree(pe); } /** @@ -1921,24 +1700,20 @@ static void create_filter_finish(struct filter_parse_state *ps) * freeing it. */ static int create_filter(struct trace_event_call *call, - char *filter_str, bool set_str, + char *filter_string, bool set_str, struct event_filter **filterp) { + struct filter_parse_error *pe = NULL; struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; int err; - err = create_filter_start(filter_str, set_str, &ps, &filter); - if (!err) { - err = replace_preds(call, filter, ps, false); - if (err && set_str) - append_filter_err(ps, filter); - } - if (err && !set_str) { - free_event_filter(filter); - filter = NULL; - } - create_filter_finish(ps); + err = create_filter_start(filter_string, set_str, &pe, &filter); + if (err) + return err; + + err = process_preds(call, filter_string, filter, pe); + if (err && set_str) + append_filter_err(pe, filter); *filterp = filter; return err; @@ -1965,21 +1740,21 @@ static int create_system_filter(struct trace_subsystem_dir *dir, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; + struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &ps, &filter); + err = create_filter_start(filter_str, true, &pe, &filter); if (!err) { - err = replace_system_preds(dir, tr, ps, filter_str); + err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); filter->filter_string = NULL; } else { - append_filter_err(ps, filter); + append_filter_err(pe, filter); } } - create_filter_finish(ps); + create_filter_finish(pe); *filterp = filter; return err; @@ -2162,66 +1937,79 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, return ret; } -static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +static int ftrace_function_check_pred(struct filter_pred *pred) { struct ftrace_event_field *field = pred->field; - if (leaf) { - /* - * Check the leaf predicate for function trace, verify: - * - only '==' and '!=' is used - * - the 'ip' field is used - */ - if ((pred->op != OP_EQ) && (pred->op != OP_NE)) - return -EINVAL; + /* + * Check the predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; - if (strcmp(field->name, "ip")) - return -EINVAL; - } else { - /* - * Check the non leaf predicate for function trace, verify: - * - only '||' is used - */ - if 
(pred->op != OP_OR) - return -EINVAL; - } + if (strcmp(field->name, "ip")) + return -EINVAL; return 0; } -static int ftrace_function_set_filter_cb(enum move_type move, - struct filter_pred *pred, - int *err, void *data) +static int ftrace_function_set_filter_pred(struct filter_pred *pred, + struct function_filter_data *data) { + int ret; + /* Checking the node is valid for function trace. */ - if ((move != MOVE_DOWN) || - (pred->left != FILTER_PRED_INVALID)) { - *err = ftrace_function_check_pred(pred, 0); - } else { - *err = ftrace_function_check_pred(pred, 1); - if (*err) - return WALK_PRED_ABORT; - - *err = __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - pred->regex.len, - data); - } + ret = ftrace_function_check_pred(pred); + if (ret) + return ret; + + return __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); +} + +static bool is_or(struct prog_entry *prog, int i) +{ + int target; - return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; + /* + * Only "||" is allowed for function events, thus, + * all true branches should jump to true, and any + * false branch should jump to false. + */ + target = prog[i].target + 1; + /* True and false have NULL preds (all prog entries should jump to one */ + if (prog[target].pred) + return false; + + /* prog[target].target is 1 for TRUE, 0 for FALSE */ + return prog[i].when_to_branch == prog[target].target; } static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { + struct prog_entry *prog = filter->prog; struct function_filter_data data = { .first_filter = 1, .first_notrace = 1, .ops = &event->ftrace_ops, }; + int i; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + + if (!is_or(prog, i)) + return -EINVAL; - return walk_pred_tree(filter->preds, filter->root, - ftrace_function_set_filter_cb, &data); + if (ftrace_function_set_filter_pred(pred, &data) < 0) + return -EINVAL; + } + return 0; } #else static int ftrace_function_set_filter(struct perf_event *event, @@ -2364,26 +2152,27 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event) return 1; } -static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) +static void update_pred_fn(struct event_filter *filter, char *fields) { - char *fields = data; + struct prog_entry *prog = filter->prog; + int i; - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) { + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; + WARN_ON_ONCE(!pred->fn); + if (!field) { - WARN(1, "all leafs should have field defined"); - return WALK_PRED_DEFAULT; + WARN_ONCE(1, "all leafs should have field defined %d", i); + continue; } + if (!strchr(fields, *field->name)) - return WALK_PRED_DEFAULT; + continue; - WARN_ON(!pred->fn); pred->fn = test_pred_visited_fn; } - return WALK_PRED_DEFAULT; } static __init int ftrace_test_event_filter(void) @@ -2413,9 +2202,7 @@ static __init int ftrace_test_event_filter(void) */ preempt_disable(); if (*d->not_visited) - walk_pred_tree(filter->preds, filter->root, - test_walk_pred_cb, - d->not_visited); + update_pred_fn(filter, d->not_visited); test_pred_visited = 0; err = filter_match_preds(filter, &d->rec); -- cgit v1.2.3-71-gd317 From caacdbf4aa567ab5e8de1a4070195c5d3e8f1340 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 7 Mar 2018 15:57:27 -0800 Subject: genirq: Add CONFIG_GENERIC_IRQ_MULTI_HANDLER The arm 
multi irq handler registration mechanism has been copied into a handful of architectures, including arm64 and openrisc. RISC-V needs the same mechanism. Instead of adding yet another copy for RISC-V, move the arm implementation into the core code, guarded by a new Kconfig symbol: CONFIG_GENERIC_IRQ_MULTI_HANDLER. Subsequent patches will convert the various architectures. Signed-off-by: Palmer Dabbelt Signed-off-by: Thomas Gleixner Cc: jonas@southpole.se Cc: catalin.marinas@arm.com Cc: Will Deacon Cc: linux@armlinux.org.uk Cc: stefan.kristiansson@saunalahti.fi Cc: openrisc@lists.librecores.org Cc: shorne@gmail.com Cc: linux-riscv@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/20180307235731.22627-2-palmer@sifive.com --- include/linux/irq.h | 18 ++++++++++++++++++ kernel/irq/Kconfig | 5 +++++ kernel/irq/handle.c | 15 +++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 979eed1b2654..65916a305f3d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1165,4 +1165,22 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +/* + * Registers a generic IRQ handling function as the top-level IRQ handler in + * the system, which is generally the first C code called from an assembly + * architecture-specific interrupt handler. + * + * Returns 0 on success, or -EBUSY if an IRQ handler has already been + * registered. + */ +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); + +/* + * Allows interrupt handlers to find the irqchip that's been registered as the + * top-level IRQ handler. + */ +extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +#endif + #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 6fc87ccda1d7..5f3e2baefca9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -132,3 +132,8 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. endmenu + +config GENERIC_IRQ_MULTI_HANDLER + bool + help + Allow to specify the low level IRQ handler at run time. diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 79f987b942b8..3570c715c3e7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,6 +20,10 @@ #include "internals.h" +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @desc: description of the interrupt @@ -207,3 +211,14 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } + +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + return 0; +} +#endif -- cgit v1.2.3-71-gd317 From 615755a77b2461ed78dfafb8a6649456201949c7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Mar 2018 10:23:21 -0700 Subject: bpf: extend stackmap to save binary_build_id+offset instead of address Currently, bpf stackmaps store an address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain a mapping from these virtual addresses to symbols in the binary.
Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system wide profiling that is always on. This patch tries to solve this problem with a variation of stackmap. This variation is enabled by the flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, the variation stores ELF file build_id + offset. Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash: [user@]$ readelf -n /bin/bash ... Build ID: XXXXXXXXXX ... With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID for each entry in the call trace, and translate it into the following struct: struct bpf_stack_build_id_offset { __s32 status; unsigned char build_id[BPF_BUILD_ID_SIZE]; union { __u64 offset; __u64 ip; }; }; The search for the build_id is limited to the first page of the file, and this page should be in the page cache. Otherwise, we fall back to storing the ip for this entry (the ip field in struct bpf_stack_build_id_offset). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files in a few different systems shows that almost all binary and dynamic library files have the build_id in the first page. Build_id is only meaningful for the user stack. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it automatically falls back to storing only the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the build_id lookup fails for some reason, it also falls back to storing the ip. User space can access struct bpf_stack_build_id_offset with the bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain the mapping from build_id to binary files. This mostly static mapping is much easier to maintain than per process address maps. Note: Stackmaps with build_id only work in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma(). If this changes, we would like to allow build_id lookup in nmi context.
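For illustration only (not part of this patch), here is a minimal user-space sketch of how such a map might be created and read back. It assumes the libbpf-style helpers bpf_create_map() and bpf_map_lookup_elem(); the function names and the max_entries value are made up, and error handling is omitted:

#include <stdio.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

#define MAX_FRAMES 127	/* default sysctl_perf_event_max_stack */

/* value_size must be a multiple of sizeof(struct bpf_stack_build_id) */
static int create_build_id_stackmap(void)
{
	return bpf_create_map(BPF_MAP_TYPE_STACK_TRACE, sizeof(__u32),
			      MAX_FRAMES * sizeof(struct bpf_stack_build_id),
			      1024 /* max_entries */, BPF_F_STACK_BUILD_ID);
}

/* stack_id comes from a BPF program that called bpf_get_stackid() */
static void dump_trace(int stack_fd, __u32 stack_id)
{
	struct bpf_stack_build_id trace[MAX_FRAMES];
	int i;

	if (bpf_map_lookup_elem(stack_fd, &stack_id, trace))
		return;

	for (i = 0; i < MAX_FRAMES; i++) {
		if (trace[i].status == BPF_STACK_BUILD_ID_EMPTY)
			break;	/* end of this stack trace */
		if (trace[i].status == BPF_STACK_BUILD_ID_VALID)
			printf("build_id + 0x%llx\n",
			       (unsigned long long)trace[i].offset);
		else	/* BPF_STACK_BUILD_ID_IP: fell back to a raw ip */
			printf("ip 0x%llx\n",
			       (unsigned long long)trace[i].ip);
	}
}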
Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 22 ++++ kernel/bpf/stackmap.c | 257 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 257 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a66769e5875..1e15d1724d89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43f5894..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include #include #include +#include +#include #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -114,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. 
+ */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is 
+ * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(¤t->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(¤t->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - 
memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); -- cgit v1.2.3-71-gd317 From e36df28f532f882965404d58e240f2e058b61f45 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 13 Feb 2018 15:28:34 +0800 Subject: printk: move dump stack related code to lib/dump_stack.c dump_stack related stuff should belong to lib/dump_stack.c thus move them there. Also conditionally compile lib/dump_stack.c since dump_stack code does not make sense if printk is disabled. Link: http://lkml.kernel.org/r/20180213072834.GA24784@dhcp-128-65.nay.redhat.com To: Steven Rostedt Cc: linux-kernel@vger.kernel.org Cc: akpm@linux-foundation.org Cc: Andi Kleen Signed-off-by: Dave Young Suggested-by: Steven Rostedt Suggested-by: Sergey Senozhatsky Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 7 ++++-- kernel/printk/printk.c | 60 -------------------------------------------------- lib/Makefile | 3 ++- lib/dump_stack.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index e9b603ee9953..6d7e800affd8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -201,6 +201,7 @@ void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); +extern asmlinkage void dump_stack(void) __cold; extern void printk_safe_init(void); extern void printk_safe_flush(void); extern void printk_safe_flush_on_panic(void); @@ -264,6 +265,10 @@ static inline void show_regs_print_info(const char *log_lvl) { } +static inline asmlinkage void dump_stack(void) +{ +} + static inline void printk_safe_init(void) { } @@ -279,8 +284,6 @@ static inline void printk_safe_flush_on_panic(void) extern int kptr_restrict; -extern asmlinkage void dump_stack(void) __cold; - #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fa3de5f10e0e..dc663f288463 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -42,13 +42,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include @@ -3257,62 +3255,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -static char dump_stack_arch_desc_str[128]; - -/** - * dump_stack_set_arch_desc - set arch-specific str to show with task dumps - * @fmt: printf-style format string - * @...: arguments for the format string - * - * The configured string will be printed right after utsname during task - * dumps. Usually used to add arch-specific system identifiers. If an - * arch wants to make use of such an ID string, it should initialize this - * as soon as possible during boot. - */ -void __init dump_stack_set_arch_desc(const char *fmt, ...) 
-{ - va_list args; - - va_start(args, fmt); - vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), - fmt, args); - va_end(args); -} - -/** - * dump_stack_print_info - print generic debug info for dump_stack() - * @log_lvl: log level - * - * Arch-specific dump_stack() implementations can use this function to - * print out the same debug information as the generic dump_stack(). - */ -void dump_stack_print_info(const char *log_lvl) -{ - printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n", - log_lvl, raw_smp_processor_id(), current->pid, current->comm, - kexec_crash_loaded() ? "Kdump: loaded " : "", - print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - if (dump_stack_arch_desc_str[0] != '\0') - printk("%sHardware name: %s\n", - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -} - -/** - * show_regs_print_info - print generic debug info for show_regs() - * @log_lvl: log level - * - * show_regs() implementations can use this function to print out generic - * debug information. - */ -void show_regs_print_info(const char *log_lvl) -{ - dump_stack_print_info(log_lvl); -} - #endif diff --git a/lib/Makefile b/lib/Makefile index 7adb066692b3..6ae3bd481379 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,7 +18,7 @@ KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ + rbtree.o radix-tree.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ @@ -26,6 +26,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ earlycpio.o seq_buf.o siphash.o \ nmi_backtrace.o nodemask.o win_minmax.o +lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o diff --git a/lib/dump_stack.c b/lib/dump_stack.c index c5edbedd364d..5cff72f18c4a 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -10,6 +10,66 @@ #include #include #include +#include +#include + +static char dump_stack_arch_desc_str[128]; + +/** + * dump_stack_set_arch_desc - set arch-specific str to show with task dumps + * @fmt: printf-style format string + * @...: arguments for the format string + * + * The configured string will be printed right after utsname during task + * dumps. Usually used to add arch-specific system identifiers. If an + * arch wants to make use of such an ID string, it should initialize this + * as soon as possible during boot. + */ +void __init dump_stack_set_arch_desc(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), + fmt, args); + va_end(args); +} + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). + */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + kexec_crash_loaded() ? 
"Kdump: loaded " : "", + print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + if (dump_stack_arch_desc_str[0] != '\0') + printk("%sHardware name: %s\n", + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); +} + +/** + * show_regs_print_info - print generic debug info for show_regs() + * @log_lvl: log level + * + * show_regs() implementations can use this function to print out generic + * debug information. + */ +void show_regs_print_info(const char *log_lvl) +{ + dump_stack_print_info(log_lvl); +} static void __dump_stack(void) { -- cgit v1.2.3-71-gd317 From fcb3029a8d89487470f2e175645a9a993949233c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Mar 2018 16:38:04 +0100 Subject: cpu/hotplug: Fix unused function warning The cpuhp_is_ap_state() function is no longer called outside of the CONFIG_SMP #ifdef section, causing a harmless warning: kernel/cpu.c:129:13: error: 'cpuhp_is_ap_state' defined but not used [-Werror=unused-function] This moves the function into the #ifdef to get a clean build again. Fixes: 17a2f1ced028 ("cpu/hotplug: Merge cpuhp_bp_states and cpuhp_ap_states") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Lai Jiangshan Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/20180315153829.3819606-1-arnd@arndb.de --- kernel/cpu.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a1860d42aacf..0db8938fbb23 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,15 +126,6 @@ struct cpuhp_step { static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_hp_states[]; -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitly in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { return cpuhp_hp_states + state; @@ -235,6 +226,15 @@ err: } #ifdef CONFIG_SMP +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; -- cgit v1.2.3-71-gd317 From 1a8429132e1d2ada3832db5b4a0802c49affb750 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Mar 2018 23:11:26 +0100 Subject: mm: remove blackfin MPU support The CONFIG_MPU option was only defined on blackfin, and that architecture is now being removed, so the respective code can be simplified. A lot of other microcontrollers have an MPU, but I suspect that if we want to bring that support back, we'd do it differently anyway. 
Signed-off-by: Arnd Bergmann --- kernel/module.c | 4 ---- mm/nommu.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..2c1df850029b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif } void *__symbol_get(const char *symbol) diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..838a8fdec5c2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -662,22 +662,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * update protection on a vma - */ -static void protect_vma(struct vm_area_struct *vma, unsigned long flags) -{ -#ifdef CONFIG_MPU - struct mm_struct *mm = vma->vm_mm; - long start = vma->vm_start & PAGE_MASK; - while (start < vma->vm_end) { - protect_page(mm, start, flags); - start += PAGE_SIZE; - } - update_protections(mm); -#endif -} - /* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous @@ -695,8 +679,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; - protect_vma(vma, vma->vm_flags); - /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -757,8 +739,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - protect_vma(vma, 0); - mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ -- cgit v1.2.3-71-gd317 From edb39592a5877bd91b2e6ee15194268f35b04892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 17:36:56 +0100 Subject: perf: Fix sibling iteration Mark noticed that the change to sibling_list changed some iteration semantics; because previously we used group_list as list entry, sibling events would always have an empty sibling_list. But because we now use sibling_list for both list head and list entry, siblings will report as having siblings. Fix this with a custom for_each_sibling_event() iterator. 
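To make the new iteration semantics concrete, here is a hedged sketch of the common PMU group-validation pattern after this change; validate_event() stands in for any driver-specific check and is not a real function, but the loop shape is the one the conversions below use:

static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct perf_event *sibling;

	/* check the leader itself first */
	if (!validate_event(leader))
		return -EINVAL;

	/*
	 * Visit each sibling exactly once.  Because the macro checks
	 * event->group_leader == event, iterating over a non-leader
	 * now correctly visits nothing, instead of misinterpreting the
	 * sibling's sibling_list entry as a list head.
	 */
	for_each_sibling_event(sibling, leader) {
		if (!validate_event(sibling))
			return -EINVAL;
	}

	return 0;
}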
Fixes: 8343aae66167 ("perf/core: Remove perf_event::group_entry") Reported-by: Mark Rutland Suggested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: vincent.weaver@maine.edu Cc: alexander.shishkin@linux.intel.com Cc: torvalds@linux-foundation.org Cc: alexey.budankov@linux.intel.com Cc: valery.cherepennikov@intel.com Cc: eranian@google.com Cc: acme@redhat.com Cc: linux-tip-commits@vger.kernel.org Cc: davidcc@google.com Cc: kan.liang@intel.com Cc: Dmitry.Prohorov@intel.com Cc: jolsa@redhat.com Link: https://lkml.kernel.org/r/20180315170129.GX4043@hirez.programming.kicks-ass.net --- arch/alpha/kernel/perf_event.c | 2 +- arch/arm/mach-imx/mmdc.c | 2 +- arch/arm/mm/cache-l2x0-pmu.c | 2 +- arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/perf/core-fsl-emb.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/intel/uncore.c | 2 +- drivers/bus/arm-cci.c | 2 +- drivers/bus/arm-ccn.c | 4 ++-- drivers/perf/arm_dsu_pmu.c | 2 +- drivers/perf/arm_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pmu.c | 2 +- drivers/perf/qcom_l2_pmu.c | 7 +++---- drivers/perf/qcom_l3_pmu.c | 2 +- drivers/perf/xgene_pmu.c | 4 ++-- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 34 +++++++++++++++----------------- 19 files changed, 41 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 435864c24479..5613aa378a83 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -351,7 +351,7 @@ static int collect_events(struct perf_event *group, int max_count, evtype[n] = group->hw.event_base; current_idx[n++] = PMC_NO_INDEX; } - list_for_each_entry(pe, &group->sibling_list, sibling_list) { + for_each_sibling_event(pe, group) { if (!is_software_event(pe) && pe->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) return -1; diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c index 27a9ca20933e..04b3bf71de94 100644 --- a/arch/arm/mach-imx/mmdc.c +++ b/arch/arm/mach-imx/mmdc.c @@ -269,7 +269,7 @@ static bool mmdc_pmu_group_is_valid(struct perf_event *event) return false; } - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (!mmdc_pmu_group_event_is_valid(sibling, pmu, &counter_mask)) return false; } diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c index 3a89ea4c2b57..afe5b4c7b164 100644 --- a/arch/arm/mm/cache-l2x0-pmu.c +++ b/arch/arm/mm/cache-l2x0-pmu.c @@ -293,7 +293,7 @@ static bool l2x0_pmu_group_is_valid(struct perf_event *event) else if (!is_software_event(leader)) return false; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (sibling->pmu == pmu) num_hw++; else if (!is_software_event(sibling)) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 46097ff3208b..ee73550f0b9a 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -711,7 +711,7 @@ static int validate_group(struct perf_event *event) if (mipsxx_pmu_alloc_counter(&fake_cpuc, &leader->hw) < 0) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (mipsxx_pmu_alloc_counter(&fake_cpuc, &sibling->hw) < 0) return -EINVAL; } diff --git a/arch/powerpc/perf/core-book3s.c 
b/arch/powerpc/perf/core-book3s.c index 7c1f66050433..f8908ea4ea73 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1426,7 +1426,7 @@ static int collect_events(struct perf_event *group, int max_count, flags[n] = group->hw.event_base; events[n++] = group->hw.config; } - list_for_each_entry(event, &group->sibling_list, sibling_list) { + for_each_sibling_event(event, group) { if (event->pmu->task_ctx_nr == perf_hw_context && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c index 94c2e63662c6..85f1d18e5fd3 100644 --- a/arch/powerpc/perf/core-fsl-emb.c +++ b/arch/powerpc/perf/core-fsl-emb.c @@ -277,7 +277,7 @@ static int collect_events(struct perf_event *group, int max_count, ctrs[n] = group; n++; } - list_for_each_entry(event, &group->sibling_list, sibling_list) { + for_each_sibling_event(event, group) { if (!is_software_event(event) && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index a0a86d369119..d3149baaa33c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1342,7 +1342,7 @@ static int collect_events(struct perf_event *group, int max_count, events[n] = group->hw.event_base; current_idx[n++] = PIC_NO_INDEX; } - list_for_each_entry(event, &group->sibling_list, sibling_list) { + for_each_sibling_event(event, group) { if (!is_software_event(event) && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 77a4125b6b1f..bfc8f43909c1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -990,7 +990,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, sibling_list) { + for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9e374cd22ad2..a7956fc7ca1d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -354,7 +354,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, sibling_list) { + for_each_sibling_event(event, leader) { if (!is_box_event(box, event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c index c98435bdb64f..c4c0c8560cce 100644 --- a/drivers/bus/arm-cci.c +++ b/drivers/bus/arm-cci.c @@ -1311,7 +1311,7 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c index 1c310a4be000..65b7e4042ece 100644 --- a/drivers/bus/arm-ccn.c +++ b/drivers/bus/arm-ccn.c @@ -846,11 +846,11 @@ static int arm_ccn_pmu_event_init(struct perf_event *event) !is_software_event(event->group_leader)) return -EINVAL; - list_for_each_entry(sibling, &event->group_leader->sibling_list, - sibling_list) + for_each_sibling_event(sibling, event->group_leader) { if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; + } return 0; 
} diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 660680d78147..660cb8ac886a 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -536,7 +536,7 @@ static bool dsu_pmu_validate_group(struct perf_event *event) memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask)); if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader)) return false; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling)) return false; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 628d7a7b9526..344e2083e941 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -311,7 +311,7 @@ validate_group(struct perf_event *event) if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index e3356087fd76..44df61397a38 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -82,7 +82,7 @@ static bool hisi_validate_event_group(struct perf_event *event) counters++; } - list_for_each_entry(sibling, &event->group_leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, event->group_leader) { if (is_software_event(sibling)) continue; if (sibling->pmu != event->pmu) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 5e535a718965..842135cf35a3 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -534,14 +534,14 @@ static int l2_cache_event_init(struct perf_event *event) return -EINVAL; } - list_for_each_entry(sibling, &event->group_leader->sibling_list, - sibling_list) + for_each_sibling_event(sibling, event->group_leader) { if (sibling->pmu != event->pmu && !is_software_event(sibling)) { dev_dbg_ratelimited(&l2cache_pmu->pdev->dev, "Can't create mixed PMU group\n"); return -EINVAL; } + } cluster = get_cluster_pmu(l2cache_pmu, event->cpu); if (!cluster) { @@ -571,8 +571,7 @@ static int l2_cache_event_init(struct perf_event *event) return -EINVAL; } - list_for_each_entry(sibling, &event->group_leader->sibling_list, - sibling_list) { + for_each_sibling_event(sibling, event->group_leader) { if ((sibling != event) && !is_software_event(sibling) && (L2_EVT_GROUP(sibling->attr.config) == diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index 5dedf4b1a552..2dc63d61f2ea 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -468,7 +468,7 @@ static bool qcom_l3_cache__validate_event_group(struct perf_event *event) counters = event_num_counters(event); counters += event_num_counters(leader); - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sibling, leader) { if (is_software_event(sibling)) continue; if (sibling->pmu != event->pmu) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index f1f4a56cab5e..6bdb1dad805f 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -949,11 +949,11 @@ static int xgene_perf_event_init(struct perf_event *event) !is_software_event(event->group_leader)) return -EINVAL; - list_for_each_entry(sibling, &event->group_leader->sibling_list, - sibling_list) + for_each_sibling_event(sibling, event->group_leader) { if 
(sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; + } return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2bb200e1bbea..ff39ab011376 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -536,6 +536,10 @@ struct pmu_event_list { struct list_head list; }; +#define for_each_sibling_event(sibling, event) \ + if ((event)->group_leader == (event)) \ + list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) + /** * struct perf_event - performance event kernel representation: */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3b4c7792a6ac..4d7a460d6669 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; - list_for_each_entry(sibling, &leader->sibling_list, sibling_list) + for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } @@ -1828,7 +1828,7 @@ static void perf_group_attach(struct perf_event *event) perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, sibling_list) + for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } @@ -1928,7 +1928,7 @@ static void perf_group_detach(struct perf_event *event) out: perf_event__header_size(event->group_leader); - list_for_each_entry(tmp, &event->group_leader->sibling_list, sibling_list) + for_each_sibling_event(tmp, event->group_leader) perf_event__header_size(tmp); } @@ -1951,13 +1951,13 @@ static inline int __pmu_filter_match(struct perf_event *event) */ static inline int pmu_filter_match(struct perf_event *event) { - struct perf_event *child; + struct perf_event *sibling; if (!__pmu_filter_match(event)) return 0; - list_for_each_entry(child, &event->sibling_list, sibling_list) { - if (!__pmu_filter_match(child)) + for_each_sibling_event(sibling, event) { + if (!__pmu_filter_match(sibling)) return 0; } @@ -2031,7 +2031,7 @@ group_sched_out(struct perf_event *group_event, /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, sibling_list) + for_each_sibling_event(event, group_event) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); @@ -2310,7 +2310,7 @@ group_sched_in(struct perf_event *group_event, /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, sibling_list) { + for_each_sibling_event(event, group_event) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; @@ -2326,7 +2326,7 @@ group_error: * partial group before returning: * The events up to the failed event are scheduled out normally. 
*/ - list_for_each_entry(event, &group_event->sibling_list, sibling_list) { + for_each_sibling_event(event, group_event) { if (event == partial_group) break; @@ -3863,7 +3863,7 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, sibling_list) { + for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -4711,7 +4711,7 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - list_for_each_entry(sub, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -4905,7 +4905,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, sibling_list) + for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } @@ -6077,7 +6077,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, sibling_list) { + for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && @@ -10662,8 +10662,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(group_leader, 0); put_ctx(gctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - sibling_list) { + for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -10684,8 +10683,7 @@ SYSCALL_DEFINE5(perf_event_open, * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ - list_for_each_entry(sibling, &group_leader->sibling_list, - sibling_list) { + for_each_sibling_event(sibling, group_leader) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); @@ -11324,7 +11322,7 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ - list_for_each_entry(sub, &parent_event->sibling_list, sibling_list) { + for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) -- cgit v1.2.3-71-gd317 From 24868367cdcac447232ebcb2aa06e1bf91291586 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 16 Mar 2018 12:51:40 +0000 Subject: perf/core: Clear sibling list of detached events When perf_group_detach() is called on a group leader, it updates each sibling's group_leader field to point to that sibling, effectively upgrading each sibling to a group leader. After perf_group_detach() has completed, the caller may free the leader event. We only remove siblings from the group leader's sibling_list when the leader has a non-empty group_node. This was fine prior to commit: 8343aae66167df67 ("perf/core: Remove perf_event::group_entry") ... as the sibling's sibling_list would be empty. However, now that we use the sibling_list field as both the list head and the list entry, this leaves each sibling with a non-empty sibling list, including the stale leader event. If perf_group_detach() is subsequently called on a sibling, it will appear to be a group leader, and we'll walk the sibling_list, potentially dereferencing these stale events.
In 0day testing, this has been observed to result in kernel panics. Let's avoid this by always removing siblings from the sibling list when we promote them to leaders. Fixes: 8343aae66167df67 ("perf/core: Remove perf_event::group_entry") Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Cc: vincent.weaver@maine.edu Cc: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: Alexey Budankov Cc: valery.cherepennikov@intel.com Cc: linux-tip-commits@vger.kernel.org Cc: eranian@google.com Cc: acme@redhat.com Cc: alexander.shishkin@linux.intel.com Cc: davidcc@google.com Cc: kan.liang@intel.com Cc: Dmitry.Prohorov@intel.com Cc: Jiri Olsa Link: https://lkml.kernel.org/r/20180316131741.3svgr64yibc6vsid@lakrids.cambridge.arm.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d7a460d6669..2776a660db15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1906,12 +1906,12 @@ static void perf_group_detach(struct perf_event *event) list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { sibling->group_leader = sibling; + list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (!RB_EMPTY_NODE(&event->group_node)) { - list_del_init(&sibling->sibling_list); add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) { -- cgit v1.2.3-71-gd317 From a84d1169164b274f13b97a23ff235c000efe3b49 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Mar 2018 17:12:40 +0100 Subject: y2038: Introduce struct __kernel_old_timeval Dealing with 'struct timeval' users in the y2038 series is a bit tricky: We have two definitions of timeval that are visible to user space, one comes from glibc (or some other C library), the other comes from linux/time.h. The kernel copy is what we want to be used for a number of structures defined by the kernel itself, e.g. elf_prstatus (used in core dumps), sysinfo and rusage (used in system calls). These generally tend to be used for passing time intervals rather than absolute (epoch-based) times, so they do not suffer from the y2038 overflow. Some of them could be changed to use 64-bit timestamps by creating new system calls; others, like the core files, cannot easily be changed. An application using these interfaces likely also uses gettimeofday() or other interfaces that use absolute times, and passes 'struct timeval' pointers directly into kernel interfaces, so glibc must redefine its timeval based on a 64-bit time_t when it introduces its y2038-safe interfaces. The only reasonable way forward I see is to remove the 'timeval' definition from the kernel's uapi headers, and change the interfaces that we do not want to (or cannot) duplicate for 64-bit times to use a new __kernel_old_timeval definition instead. This type should be avoided for all new interfaces (those can use 64-bit nanoseconds, or the 64-bit version of timespec instead), and should be used with great care when converting existing interfaces from timeval, to be sure they don't suffer from the y2038 overflow, and only with consensus for the particular user that using __kernel_old_timeval is better than moving to a 64-bit based interface. The structure name is intentionally chosen to not conflict with user space types, and to be ugly enough to discourage its use.
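As a purely hypothetical illustration of the intended conversion pattern (the real conversions, e.g. of elf_prstatus, come in later patches of the series; the structure below is made up):

/* Hypothetical kernel-defined record that traditionally embedded
 * 'struct timeval' to report time intervals. */
struct example_accounting {
	struct __kernel_old_timeval	utime;	/* was: struct timeval */
	struct __kernel_old_timeval	stime;	/* interval, not epoch-based */
};

User space that only reads utime.tv_sec/tv_usec keeps working, while code that passes &rec.utime to a function expecting a 'struct timeval *' stops compiling once libc's timeval grows a 64-bit time_t, which is exactly the alerting behavior described below.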
Note that ioctl based interfaces that pass a bare 'timeval' pointer cannot change to '__kernel_old_timeval' because the user space source code refers to 'timeval' instead, and we don't want to modify the user space sources if possible. However, any application that relies on a structure to contain an embedded 'timeval' (e.g. by passing a pointer to the member into a function call that expects a timeval pointer) is broken when that structure gets converted to __kernel_old_timeval. I don't see any way around that, and we have to rely on the compiler to produce a warning or compile failure that will alert users when they recompile their sources against a new libc. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: John Stultz Cc: Al Viro Link: https://lkml.kernel.org/r/20180315161739.576085-1-arnd@arndb.de --- include/linux/time32.h | 1 + include/uapi/linux/time.h | 12 ++++++++++++ kernel/time/time.c | 12 ++++++++++++ 3 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/include/linux/time32.h b/include/linux/time32.h index 65b1de25198d..d2bcd4377b56 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -217,5 +217,6 @@ static inline s64 timeval_to_ns(const struct timeval *tv) * Returns the timeval representation of the nsec parameter. */ extern struct timeval ns_to_timeval(const s64 nsec); +extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); #endif diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 61a187df8da2..16a296612ba4 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,6 +42,18 @@ struct itimerval { struct timeval it_value; /* current value */ }; +/* + * legacy timeval structure, only embedded in structures that + * traditionally used 'timeval' to pass time intervals (not absolute + * times). Do not add new users. If user space fails to compile + * here, this is probably because it is not y2038 safe and needs to + * be changed to use another interface. + */ +struct __kernel_old_timeval { + __kernel_long_t tv_sec; + __kernel_long_t tv_usec; +}; + /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ diff --git a/kernel/time/time.c b/kernel/time/time.c index bd4e6c7dd689..3044d48ebe56 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -488,6 +488,18 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * -- cgit v1.2.3-71-gd317 From 05f0fe6b74dbd7690a4cbd61810948b7d575576a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:13 -0700 Subject: RCU, workqueue: Implement rcu_work There are cases where an RCU callback needs to be bounced to a sleepable context. This is currently done by the RCU callback queueing a work item, which can be cumbersome to write and confusing to read. This patch introduces rcu_work, a workqueue work variant which gets executed after an RCU grace period, and converts the open coded bouncing in fs/aio and kernel/cgroup. v3: Dropped queue_rcu_work_on(). Documented RCU grace period behavior after queue_rcu_work().
v2: Use rcu_barrier() instead of synchronize_rcu() to wait for completion of previously queued rcu callback as per Paul. Signed-off-by: Tejun Heo Acked-by: "Paul E. McKenney" Cc: Linus Torvalds --- include/linux/workqueue.h | 23 ++++++++++++++++++++ kernel/workqueue.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bc0cda180c8b..d026f8f818cc 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,7 @@ #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> +#include <linux/rcupdate.h> struct workqueue_struct; @@ -120,6 +121,14 @@ struct delayed_work { int cpu; }; +struct rcu_work { + struct work_struct work; + struct rcu_head rcu; + + /* target workqueue ->rcu uses to queue ->work */ + struct workqueue_struct *wq; +}; + /** * struct workqueue_attrs - A struct for workqueue attributes. * @@ -151,6 +160,11 @@ static inline struct delayed_work *to_delayed_work(struct work_struct *work) return container_of(work, struct delayed_work, work); } +static inline struct rcu_work *to_rcu_work(struct work_struct *work) +{ + return container_of(work, struct rcu_work, work); +} + struct execute_work { struct work_struct work; }; @@ -266,6 +280,12 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) +#define INIT_RCU_WORK(_work, _func) \ + INIT_WORK(&(_work)->work, (_func)) + +#define INIT_RCU_WORK_ONSTACK(_work, _func) \ + INIT_WORK_ONSTACK(&(_work)->work, (_func)) + /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question @@ -447,6 +467,7 @@ extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); +extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); @@ -463,6 +484,8 @@ extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); +extern bool flush_rcu_work(struct rcu_work *rwork); + extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern struct work_struct *current_work(void); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bb9a519cbf50..7df85fa9f651 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1604,6 +1604,40 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); +static void rcu_work_rcufn(struct rcu_head *rcu) +{ + struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); + + /* read the comment in __queue_work() */ + local_irq_disable(); + __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); + local_irq_enable(); +} + +/** + * queue_rcu_work - queue work after a RCU grace period + * @wq: workqueue to use + * @rwork: work to queue + * + * Return: %false if @rwork was already pending, %true otherwise. Note + * that a full RCU grace period is guaranteed only after a %true return. + * While @rwork is guaranteed to be executed after a %false return, the + * execution may happen before a full RCU grace period has passed.
+ */ +bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) +{ + struct work_struct *work = &rwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + rwork->wq = wq; + call_rcu(&rwork->rcu, rcu_work_rcufn); + return true; + } + + return false; +} +EXPORT_SYMBOL(queue_rcu_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,6 +3035,26 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * flush_rcu_work - wait for a rwork to finish executing the last queueing + * @rwork: the rcu work to flush + * + * Return: + * %true if flush_rcu_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_rcu_work(struct rcu_work *rwork) +{ + if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { + rcu_barrier(); + flush_work(&rwork->work); + return true; + } else { + return flush_work(&rwork->work); + } +} +EXPORT_SYMBOL(flush_rcu_work); + static bool __cancel_work(struct work_struct *work, bool is_dwork) { unsigned long flags; -- cgit v1.2.3-71-gd317 From 8f36aaec9c929f2864196b0799203491f6a67dc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:14 -0700 Subject: cgroup: Use rcu_work instead of explicit rcu and work item Workqueue now has rcu_work. Use it instead of open-coding rcu -> work item bouncing. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- kernel/cgroup/cgroup.c | 21 +++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9f242b876fde..92d7640632ef 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -151,8 +151,8 @@ struct cgroup_subsys_state { atomic_t online_cnt; /* percpu_ref killing and RCU release */ - struct rcu_head rcu_head; struct work_struct destroy_work; + struct rcu_work destroy_rwork; /* * PI: the parent css. Placed here for cache proximity to following diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8cda3bc3ae22..4c5d4ca0d4e4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4514,10 +4514,10 @@ static struct cftype cgroup_base_files[] = { * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. 
*/ -static void css_free_work_fn(struct work_struct *work) +static void css_free_rwork_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css = container_of(to_rcu_work(work), + struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; @@ -4563,15 +4563,6 @@ static void css_free_work_fn(struct work_struct *work) } } -static void css_free_rcu_fn(struct rcu_head *rcu_head) -{ - struct cgroup_subsys_state *css = - container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - - INIT_WORK(&css->destroy_work, css_free_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); -} - static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = @@ -4621,7 +4612,8 @@ static void css_release_work_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -4755,7 +4747,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } -- cgit v1.2.3-71-gd317 From ffa35660017161524b7db79b0ce293fa1af16c4d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:54 -0700 Subject: sockmap: convert refcnt to an atomic refcnt The sockmap refcnt up until now has been wrapped in the sk_callback_lock(), so it has not actually needed any locking of its own. The counter itself tracks the lifetime of the psock object. Sockets in a sockmap have a lifetime that is independent of the map they are part of. This is possible because a single socket may be in multiple maps. When this happens we can only release the psock data associated with the socket when the refcnt reaches zero. There are three possible sock reference decrement (delete) paths. First, through the normal sockmap process: the user deletes the socket from the map. Second, the map is removed and all sockets in the map are removed; this delete path is similar to case 1. The third case is an asynchronous socket event, such as the socket being closed. The last case handles removing sockets that are no longer available. For completeness, although inc does not pose any problems in this patch series, the inc case only happens when a psock is added to a map. Next we plan to add another socket prog type to handle policy and monitoring on the TX path. When we do this, however, we will need to keep a reference count open across the sendmsg/sendpage call, and holding the sk_callback_lock() here (on every send) seems less than ideal; it may also sleep in cases where we hit memory pressure. Instead of dealing with these issues in some clever way, simply make the reference counting a refcount_t type and do proper atomic ops. Signed-off-by: John Fastabend Acked-by: David S.
Miller Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a927e89dad6e..051b224270e3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -62,8 +62,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -373,15 +372,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - return; - - tcp_cleanup_ulp(sock); - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -511,7 +508,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -772,7 +769,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + refcount_inc(&psock->refcnt); } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { -- cgit v1.2.3-71-gd317 From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. Similar to previous sockmap usages, when a sock is added to a sockmap via a map update, if the map has a BPF_SK_MSG_VERDICT program attached, then the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in the sendmsg case and every page/offset in the sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes, SK_PASS and SK_DROP. Returning SK_DROP frees the copied data in the sendmsg case and in the sendpage case leaves the data untouched. Both cases return -EACCES to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example a sendmsg call with many one byte iov entries will likely be pushed into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied.
We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this, the initial start/end data pointers are (0,0), meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call. The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again, a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the message currently being processed. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similarly to existing redirect use cases. This allows policy to redirect msgs. Simple pseudo-code example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above, any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure the psock is not lost across the sendmsg copy into the sg, the BPF program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting the MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass it down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and TCP_RECV_QUEUE/TCP_NO_QUEUE support. These will follow the initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S.
Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 ++ include/uapi/linux/bpf.h | 22 +- kernel/bpf/sockmap.c | 712 +++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 14 +- kernel/bpf/verifier.c | 5 +- net/core/filter.c | 106 +++++++ 8 files changed, 857 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66df387106de..819229c80eca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index fdb691b520c0..109d05ccea9a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e15d1724d89..ef529539abde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -718,6 +720,15 @@ union bpf_attr { * int bpf_override_return(pt_regs, rc) * @pt_regs: pointer to struct pt_regs * @rc: the return value to set + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map using key as a lookup key for the + * sock in map. 
+ * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -779,7 +790,8 @@ union bpf_attr { FN(perf_prog_read_value), \ FN(getsockopt), \ FN(override_return), \ - FN(sock_ops_cb_flags_set), + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -942,6 +954,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 051b224270e3..69c5bccabd22 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -73,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -91,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -115,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -174,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -185,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + 
char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? 
bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here it's possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use.
+ */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; 
+ long timeo; + + /* It's possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fall back + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP; there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure it's not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. It's OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case, simply run the BPF program with the currently + * accumulated data. We don't have much choice at this point; + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ?
copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. 
+ */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -412,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -482,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -503,6 +1154,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); @@ -711,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -737,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -755,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -769,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - refcount_inc(&psock->refcnt); + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -777,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -794,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
*/ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -815,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -829,6 +1504,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -843,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -904,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..3aeb4ea2a93a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb79a34359c0..e9f7c20691c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1248,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2071,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != 
BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2109,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index 33edfa8372fd..2b6c47597eab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3591,6 +3629,16 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct 
bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3-71-gd317 From 43d4cb47f6e854ea9fe868eedf8281f81b5a1252 Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Wed, 28 Feb 2018 16:06:13 +0100 Subject: crash: export paddr_vmcoreinfo_note() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch is going to use the symbol from the fw_cfg module, to call the function and write the note location details in the vmcoreinfo entry, so qemu can produce dumps with the vmcoreinfo note. CC: Andrew Morton CC: Hari Bathini CC: Tony Luck CC: Vivek Goyal Acked-by: Baoquan He Acked-by: Dave Young Signed-off-by: Marc-André Lureau Acked-by: Gabriel Somlo Signed-off-by: Michael S. Tsirkin --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4f63597c824d..a93590cdd9e1 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa(vmcoreinfo_note); } +EXPORT_SYMBOL(paddr_vmcoreinfo_note); static int __init crash_save_vmcoreinfo_init(void) { -- cgit v1.2.3-71-gd317 From 45dbac0e288350f9a4226a5b4b651ed434dd9f85 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 15 Mar 2018 04:58:12 -0700 Subject: locking/mutex: Improve documentation On Wed, Mar 14, 2018 at 01:56:31PM -0700, Andrew Morton wrote: > My memory is weak and our documentation is awful. What does > mutex_lock_killable() actually do and how does it differ from > mutex_lock_interruptible()? Add kernel-doc for mutex_lock_killable() and mutex_lock_io(). Reword the kernel-doc for mutex_lock_interruptible(). Signed-off-by: Matthew Wilcox Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jonathan Corbet Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cl@linux.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20180315115812.GA9949@bombadil.infradead.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2048359f33d2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1082,15 +1082,16 @@ static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock); /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired. * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock(). 
If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex. * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived. */ int __sched mutex_lock_interruptible(struct mutex *lock) { @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */ int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. + */ void __sched mutex_lock_io(struct mutex *lock) { int token; -- cgit v1.2.3-71-gd317 From 7f65ea42eb00bc902f1c37a71e984e4f4064cfa9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 9 Mar 2018 09:52:42 +0000 Subject: sched/fair: Add util_est on top of PELT The util_avg signal computed by PELT is too variable for some use-cases. For example, a big task waking up after a long sleep period will have its utilization almost completely decayed. This introduces some latency before schedutil will be able to pick the best frequency to run a task. The same issue can affect task placement. Indeed, since the task utilization is already decayed at wakeup, when the task is enqueued in a CPU, this can result in a CPU running a big task being temporarily represented as almost empty. This leads to a race condition where other tasks can be potentially allocated on a CPU which just started to run a big task which slept for a relatively long period. Moreover, the PELT utilization of a task can be updated every [ms], thus making it a continuously changing value for certain longer running tasks. This means that the instantaneous PELT utilization of a RUNNING task is not really meaningful to properly support scheduler decisions. For all these reasons, a more stable signal can do a better job of representing the expected/estimated utilization of a task/cfs_rq. Such a signal can be easily created on top of PELT by still using it as an estimator which produces values to be aggregated on meaningful events. This patch adds a simple implementation of util_est, a new signal built on top of PELT's util_avg where: util_est(task) = max(task::util_avg, f(task::util_avg@dequeue)) This allows us to remember how big a task has been reported by PELT in its previous activations via f(task::util_avg@dequeue), which is the new _task_util_est(struct task_struct*) function added by this patch. If a task changes its behavior and runs longer in a new activation, after a certain time its util_est will just track the original PELT signal (i.e. task::util_avg).
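A minimal sketch of the EWMA flavour of f() implied above, assuming the w = 1/4 weight that UTIL_EST_WEIGHT_SHIFT == 2 encodes (the helper name is illustrative; the in-tree update is part of util_est_dequeue() in the fair.c hunk below):

/*
 * Sketch only: one EWMA step, ewma' = ewma + w * (last - ewma) with
 * w = 1/4, done in fixed point the way the patch does it
 * (scale up, add the signed difference, scale back down).
 */
#define UTIL_EST_WEIGHT_SHIFT	2

static unsigned int util_est_ewma_sketch(unsigned int ewma, unsigned int last)
{
	long v = ewma;

	v <<= UTIL_EST_WEIGHT_SHIFT;	/* 4 * ewma        */
	v  += (long)last - (long)ewma;	/* + (last - ewma) */
	v >>= UTIL_EST_WEIGHT_SHIFT;	/* / 4             */

	return v;
}

For example, with ewma == 1024 and last == 0 (a big task waking after a long sleep), the estimate decays gently to 768 rather than collapsing along with util_avg.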
The estimated utilization of cfs_rq is defined only for root ones. That's because the only sensible consumers of this signal are the scheduler and schedutil when looking for the overall CPU utilization due to FAIR tasks. For this reason, the estimated utilization of a root cfs_rq is simply defined as: util_est(cfs_rq) = max(cfs_rq::util_avg, cfs_rq::util_est::enqueued) where: cfs_rq::util_est::enqueued = sum(_task_util_est(task)) for each RUNNABLE task on that root cfs_rq It's worth noting that the estimated utilization is tracked only for objects of interest, specifically: - Tasks: to better support task placement decisions - root cfs_rqs: to better support both task placement decisions as well as frequency selection Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20180309095245.11071-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 ++++++++++++ kernel/sched/debug.c | 4 ++ kernel/sched/fair.c | 122 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/features.h | 5 ++ 4 files changed, 154 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 21b1168da951..f228c6033832 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -274,6 +274,34 @@ struct load_weight { u32 inv_weight; }; +/** + * struct util_est - Estimation utilization of FAIR tasks + * @enqueued: instantaneous estimated utilization of a task/cpu + * @ewma: the Exponential Weighted Moving Average (EWMA) + * utilization of a task + * + * Support data structure to track an Exponential Weighted Moving Average + * (EWMA) of a FAIR task's utilization. New samples are added to the moving + * average each time a task completes an activation. Sample's weight is chosen + * so that the EWMA will be relatively insensitive to transient changes to the + * task's workload. + * + * The enqueued attribute has a slightly different meaning for tasks and cpus: + * - task: the task's util_avg at last task dequeue time + * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU + * Thus, the util_est.enqueued of a task represents the contribution on the + * estimated utilization of the CPU where that task is currently enqueued. + * + * Only for tasks we track a moving average of the past instantaneous + * estimated utilization. This allows to absorb sporadic drops in utilization + * of an otherwise almost periodic task. + */ +struct util_est { + unsigned int enqueued; + unsigned int ewma; +#define UTIL_EST_WEIGHT_SHIFT 2 +}; + /* * The load_avg/util_avg accumulates an infinite geometric series * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_load_avg; unsigned long util_avg; + struct util_est util_est; }; struct sched_statistics { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 644d9a464380..332303be4beb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", + cfs_rq->avg.util_est.enqueued); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); + P(se.avg.util_est.ewma); + P(se.avg.util_est.enqueued); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3582117e1580..22b59a7facd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, ue.enqueued); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += _task_util_est(p); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); +} + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + margin < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +{ + long last_ewma_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Update root cfs_rq's estimated utilization + * + * If *p is the last task then the root cfs_rq's estimated utilization + * of a CPU is 0 by definition. + */ + ue.enqueued = 0; + if (cfs_rq->nr_running) { + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + _task_util_est(p)); + } + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * Skip update of task's estimated utilization when its EWMA is + * already ~1% close to its last activation value.
+ */ + ue = p->se.avg.util_est; + ue.enqueued = task_util(p); + last_ewma_diff = ue.enqueued - ue.ewma; + if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + WRITE_ONCE(p->se.avg.util_est, ue); +} + #else /* CONFIG_SMP */ static inline int @@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) return 0; } +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} + #endif /* CONFIG_SMP */ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); + util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static inline unsigned long task_util(struct task_struct *p); static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) @@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline unsigned long task_util(struct task_struct *p) -{ - return p->se.avg.util_avg; -} - /* * cpu_util_wake: Compute CPU utilization with any contributions from * the waking task p removed. diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..c459a4b61544 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. + */ +SCHED_FEAT(UTIL_EST, false) -- cgit v1.2.3-71-gd317 From f9be3e5961c5554879a491961187472e923f5ee0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 9 Mar 2018 09:52:43 +0000 Subject: sched/fair: Use util_est in LB and WU paths When the scheduler looks at the CPU utilization, the current PELT value for a CPU is returned straight away. In certain scenarios this can have undesired side effects on task placement. For example, since the task utilization is decayed at wakeup time, when a long sleeping big task is enqueued it does not add immediately a significant contribution to the target CPU. As a result we generate a race condition where other tasks can be placed on the same CPU while it is still considered relatively empty. 
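To put illustrative numbers on that race (the 32ms half-life used below is the standard PELT decay rate; the concrete figures are assumptions, not taken from this patch):

    #include <math.h>
    #include <stdio.h>

    /* Illustrative only: PELT roughly halves a sleeping task's
     * utilization every 32ms. */
    static double pelt_decay(double util, double sleep_ms)
    {
        return util * pow(0.5, sleep_ms / 32.0);
    }

    int main(void)
    {
        /* A task seen at ~800/1024 before a 100ms sleep decays to ~92
         * at wakeup, so its CPU looks nearly empty right when the big
         * task is about to run again. */
        printf("%.0f\n", pelt_decay(800.0, 100.0));
        return 0;
    }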
In order to reduce these kinds of race conditions, this patch introduces the required support to integrate the usage of the CPU's estimated utilization in the wakeup path, via cpu_util_wake(), as well as in the load-balance path, via cpu_util() which is used by update_sg_lb_stats(). The estimated utilization of a CPU is defined to be the maximum between its PELT's utilization and the sum of the estimated utilization (at previous dequeue time) of all the tasks currently RUNNABLE on that CPU. This allows to properly represent the spare capacity of a CPU which, for example, has just got a big task running after a long sleep period. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20180309095245.11071-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22b59a7facd2..570b8d056282 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6432,11 +6432,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). +/** + * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * @cpu: the CPU to get the utilization of + * + * The unit of the return value must be the one of capacity so we can compare + * the utilization with the capacity of the CPU that is available for CFS task + * (ie cpu_capacity). * * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the * recent utilization of currently non-runnable tasks on a CPU. It represents @@ -6447,6 +6449,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * current capacity (capacity_curr <= capacity_orig) of the CPU because it is * the running time on this CPU scaled by capacity_curr. * + * The estimated utilization of a CPU is defined to be the maximum between its + * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks + * currently RUNNABLE on that CPU. + * This allows to properly represent the expected utilization of a CPU which + * has just got a big task running since a long sleep period. At the same time + * however it preserves the benefits of the "blocked utilization" in + * describing the potential for other tasks waking up on the same CPU. + * * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even * higher than capacity_orig because of unfortunate rounding in * cfs.avg.util_avg or just after migrating tasks and new task wakeups until @@ -6457,13 +6467,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * available capacity. We allow utilization to overshoot capacity_curr (but not * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS).
+ * + * Return: the (estimated) utilization for the specified CPU */ -static unsigned long cpu_util(int cpu) +static inline unsigned long cpu_util(int cpu) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + struct cfs_rq *cfs_rq; + unsigned int util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - return (util >= capacity) ? capacity : util; + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* @@ -6472,16 +6490,54 @@ static unsigned long cpu_util(int cpu) */ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { - unsigned long util, capacity; + struct cfs_rq *cfs_rq; + unsigned int util; /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + /* Discount task's blocked util from CPU's util */ + util -= min_t(unsigned int, util, task_util(p)); - return (util >= capacity) ? capacity : util; + /* + * Covered cases: + * + * a) if *p is the only task sleeping on this CPU, then: + * cpu_util (== task_util) > util_est (== 0) + * and thus we return: + * cpu_util_wake = (cpu_util - task_util) = 0 + * + * b) if other tasks are SLEEPING on this CPU, which is now exiting + * IDLE, then: + * cpu_util >= task_util + * cpu_util > util_est (== 0) + * and thus we discount *p's blocked utilization to return: + * cpu_util_wake = (cpu_util - task_util) >= 0 + * + * c) if other tasks are RUNNABLE on that CPU and + * util_est > cpu_util + * then we use util_est since it returns a more restrictive + * estimation of the spare capacity on that CPU, by just + * considering the expected utilization of tasks already + * runnable on that CPU. + * + * Cases a) and b) are covered by the above code, while case c) is + * covered by the following code when estimated utilization is + * enabled. + */ + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + /* + * Utilization (estimated) can exceed the CPU capacity, thus let's + * clamp to the maximum CPU capacity to ensure consistency with + * the cpu_util call. + */ + return min_t(unsigned long, util, capacity_orig_of(cpu)); } /* -- cgit v1.2.3-71-gd317 From a07630b8b2c16f82fd5b71d890079f4dd7599c1d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 9 Mar 2018 09:52:44 +0000 Subject: sched/cpufreq/schedutil: Use util_est for OPP selection When schedutil looks at the CPU utilization, the current PELT value for that CPU is returned straight away. In certain scenarios this can have undesired side effects and delays on frequency selection. For example, since the task utilization is decayed at wakeup time, a long sleeping big task newly enqueued does not add immediately a significant contribution to the target CPU. This introduces some latency before schedutil will be able to detect the best frequency required by that task. Moreover, the PELT signal build-up time is a function of the current frequency, because of the scale invariant load tracking support. 
Thus, starting from a lower frequency, the utilization build-up time will increase even more and further delay the selection of the actual frequency which better serves the task requirements. In order to reduce these kinds of latencies, we integrate the usage of the CPU's estimated utilization in the sugov_get_util function. This allows to properly consider the expected utilization of a CPU which, for example, has just got a big task running after a long sleep period. Ultimately this allows to select the best frequency to run a task right after its wake-up. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Rafael J. Wysocki Acked-by: Viresh Kumar Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Steve Muckle Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180309095245.11071-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 22909ffc04fb..c3deaee7a7a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2163,6 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) static inline unsigned long cpu_util_cfs(struct rq *rq) { - return rq->cfs.avg.util_avg; + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; } #endif -- cgit v1.2.3-71-gd317 From d519329f72a6f36bc4f2b85452640cfe583b4f81 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 9 Mar 2018 09:52:45 +0000 Subject: sched/fair: Update util_est only on util_avg updates The estimated utilization of a task is currently updated every time the task is dequeued. However, to keep overheads under control, PELT signals are effectively updated at maximum once every 1ms. Thus, for really short running tasks, it can happen that their util_avg value has not been updated since their last enqueue. If such tasks are also frequently running tasks (e.g. the kind of workload generated by hackbench) it can also happen that their util_avg is updated only every few activations. This means that updating util_est at every dequeue potentially introduces unnecessary overheads and it's also conceptually wrong if the util_avg signal has never been updated during a task activation. Let's introduce a throttling mechanism on task's util_est updates to sync them with util_avg updates. To make the solution memory efficient, both in terms of space and load/store operations, we encode a synchronization flag into the LSB of util_est.enqueued. This makes util_est an even-values-only metric, which is still considered good enough for its purpose. The synchronization bit is (re)set by __update_load_avg_se() once the PELT signal of a task has been updated during its last activation. Such a throttling mechanism allows to keep under control util_est overheads in the wakeup hot path, thus making it a suitable mechanism which can be enabled also on high-intensity workload systems. Thus, the estimated utilization scheduler feature is now switched on by default.
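The LSB trick can be shown in isolation (a userspace sketch of the flag mechanics only, not the kernel code; UTIL_AVG_UNCHANGED mirrors the macro introduced below):

    #include <stdio.h>

    #define UTIL_AVG_UNCHANGED 0x1 /* flag carried in the LSB */

    int main(void)
    {
        unsigned int enqueued = 340; /* even-values-only metric */

        /* enqueue/dequeue paths store the value with the flag set:
         * "util_avg not updated since this value was recorded" */
        enqueued |= UTIL_AVG_UNCHANGED;  /* 341 */

        /* the PELT update path clears it once util_avg changes */
        enqueued &= ~UTIL_AVG_UNCHANGED; /* back to 340 */

        printf("%u\n", enqueued);
        return 0;
    }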
Suggested-by: Chris Redpath Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20180309095245.11071-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++---- kernel/sched/features.h | 2 +- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 570b8d056282..0951d1c58d2f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3242,6 +3242,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna sa->util_avg = sa->util_sum / divider; } +/* + * When a task is dequeued, its estimated utilization should not be updated if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.enqueued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + /* * sched_entity: * @@ -3293,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); return 1; } @@ -3900,7 +3927,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, /* Update root cfs_rq's estimated utilization */ enqueued = cfs_rq->avg.util_est.enqueued; - enqueued += _task_util_est(p); + enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); } @@ -3936,7 +3963,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (cfs_rq->nr_running) { ue.enqueued = cfs_rq->avg.util_est.enqueued; ue.enqueued -= min_t(unsigned int, ue.enqueued, - _task_util_est(p)); + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); } WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); @@ -3947,12 +3974,19 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!task_sleep) return; + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue = p->se.avg.util_est; - ue.enqueued = task_util(p); + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index c459a4b61544..85ae8488039c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,4 +89,4 @@ SCHED_FEAT(WA_BIAS, true) /* * UtilEstimation. Use estimated CPU utilization.
*/ -SCHED_FEAT(UTIL_EST, false) +SCHED_FEAT(UTIL_EST, true) -- cgit v1.2.3-71-gd317 From 6b2bb7265f0b62605e8caee3613449ed0db270b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 11:40:33 +0100 Subject: sched/wait: Introduce wait_var_event() As a replacement for the wait_on_atomic_t() API provide the wait_var_event() API. The wait_var_event() API is based on the very same hashed-waitqueue idea, but doesn't care about the type (atomic_t) or the specific condition (atomic_read() == 0). IOW, it's much more widely applicable/flexible. It shares all the benefits/disadvantages of a hashed-waitqueue approach with the existing wait_on_atomic_t/wait_on_bit() APIs. The API is modeled after the existing wait_event() API, but instead of taking a wait_queue_head, it takes an address. This address is hashed to obtain a wait_queue_head from the bit_wait_table. Similar to the wait_event() API, it takes a condition expression as second argument and will wait until this expression becomes true. The following are (mostly) identical replacements: wait_on_atomic_t(&my_atomic, atomic_t_wait, TASK_UNINTERRUPTIBLE); wake_up_atomic_t(&my_atomic); wait_var_event(&my_atomic, !atomic_read(&my_atomic)); wake_up_var(&my_atomic); The only difference is that wake_up_var() is an unconditional wakeup and doesn't check the previously hard-coded (atomic_read() == 0) condition here. This is of little consequence, since most callers are already conditional on atomic_dec_and_test() and the ones that are not, are trivial to make so. Tested-by: Dan Williams Signed-off-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait_bit.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/wait_bit.c | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'kernel') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 61b39eaf7cad..3fcdb75d69cf 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode return out_of_line_wait_on_atomic_t(val, action, mode); } +extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); +extern void wake_up_var(void *var); +extern wait_queue_head_t *__var_waitqueue(void *p); + +#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ +({ \ + __label__ __out; \ + struct wait_queue_head *__wq_head = __var_waitqueue(var); \ + struct wait_bit_queue_entry __wbq_entry; \ + long __ret = ret; /* explicit shadow */ \ + \ + init_wait_var_entry(&__wbq_entry, var, \ + exclusive ?
WQ_FLAG_EXCLUSIVE : 0); \ + for (;;) { \ + long __int = prepare_to_wait_event(__wq_head, \ + &__wbq_entry.wq_entry, \ + state); \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + goto __out; \ + } \ + \ + cmd; \ + } \ + finish_wait(__wq_head, &__wbq_entry.wq_entry); \ +__out: __ret; \ +}) + +#define __wait_var_event(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) + +#define wait_var_event(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event(var, condition); \ +} while (0) + +#define __wait_var_event_killable(var, condition) \ + ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ + schedule()) + +#define wait_var_event_killable(var, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_var_event_killable(var, condition); \ + __ret; \ +}) + +#define __wait_var_event_timeout(var, condition, timeout) \ + ___wait_var_event(var, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +#define wait_var_event_timeout(var, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_var_event_timeout(var, condition, timeout); \ + __ret; \ +}) + #endif /* _LINUX_WAIT_BIT_H */ diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 4239c78f5cd3..ed84ab245a05 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -149,6 +149,54 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); +wait_queue_head_t *__var_waitqueue(void *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} +EXPORT_SYMBOL(__var_waitqueue); + +static int +var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wbq_entry = + container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wbq_entry->key.flags != key->flags || + wbq_entry->key.bit_nr != key->bit_nr) + return 0; + + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) +{ + *wbq_entry = (struct wait_bit_queue_entry){ + .key = { + .flags = (var), + .bit_nr = -1, + }, + .wq_entry = { + .private = current, + .func = var_wake_function, + .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), + }, + }; +} +EXPORT_SYMBOL(init_wait_var_entry); + +void wake_up_var(void *var) +{ + __wake_up_bit(__var_waitqueue(var), var, -1); +} +EXPORT_SYMBOL(wake_up_var); + /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash -- cgit v1.2.3-71-gd317 From 9b8cce52c4b5c08297900bfdcafc6b08d9bc4a27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 11:46:30 +0100 Subject: sched/wait: Remove the wait_on_atomic_t() API There are no users left (everyone got converted to wait_var_event()), remove it. 
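A typical conversion follows directly from the mapping given in the previous commit (sketch only; 'obj' and its 'refs' counter are hypothetical):

    /* before: fixed atomic_t type and hard-coded (== 0) condition */
    if (atomic_dec_and_test(&obj->refs))
        wake_up_atomic_t(&obj->refs);
    /* ... and on the waiting side ... */
    wait_on_atomic_t(&obj->refs, atomic_t_wait, TASK_UNINTERRUPTIBLE);

    /* after: the condition is explicit, wake_up_var() is unconditional */
    if (atomic_dec_and_test(&obj->refs))
        wake_up_var(&obj->refs);
    /* ... and on the waiting side ... */
    wait_var_event(&obj->refs, !atomic_read(&obj->refs));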
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait_bit.h | 27 ------------- kernel/sched/wait_bit.c | 101 ----------------------------------------------- 2 files changed, 128 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 3fcdb75d69cf..9318b2166439 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -10,7 +10,6 @@ struct wait_bit_key { void *flags; int bit_nr; -#define WAIT_ATOMIC_T_BIT_NR -1 unsigned long timeout; }; @@ -22,21 +21,15 @@ struct wait_bit_queue_entry { #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ - { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } - typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); -typedef int wait_atomic_t_action_f(atomic_t *counter, unsigned int mode); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(void *word, int bit); -void wake_up_atomic_t(atomic_t *p); int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); -int out_of_line_wait_on_atomic_t(atomic_t *p, wait_atomic_t_action_f action, unsigned int mode); struct wait_queue_head *bit_waitqueue(void *word, int bit); extern void __init wait_bit_init(void); @@ -57,7 +50,6 @@ extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); -extern int atomic_t_wait(atomic_t *counter, unsigned int mode); /** * wait_on_bit - wait for a bit to be cleared @@ -243,25 +235,6 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, return out_of_line_wait_on_bit_lock(word, bit, action, mode); } -/** - * wait_on_atomic_t - Wait for an atomic_t to become 0 - * @val: The atomic value being waited on, a kernel virtual address - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for - * the purpose of getting a waitqueue, but we set the key to a bit number - * outside of the target 'word'. 
- */ -static inline -int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode) -{ - might_sleep(); - if (atomic_read(val) == 0) - return 0; - return out_of_line_wait_on_atomic_t(val, action, mode); -} - extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index ed84ab245a05..60a84f5a6cb4 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -197,107 +197,6 @@ void wake_up_var(void *var) } EXPORT_SYMBOL(wake_up_var); -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - - return autoremove_wake_function(wq_entry, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_atomic_t_action_f action, unsigned int mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - val = wbq_entry->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val, mode); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &wbq_entry->wq_entry); - - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wq_entry = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .entry = \ - LIST_HEAD_INIT((name).wq_entry.entry), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, - wait_atomic_t_action_f action, - unsigned int mode) -{ - struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wq_entry, p); - - return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) -{ - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL(atomic_t_wait); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). 
- */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); - __sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); -- cgit v1.2.3-71-gd317 From b3fc5c9bb373661224906bf434c09ca0de032e82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 11:49:36 +0100 Subject: sched/wait: Improve __var_waitqueue() code generation Since we fixed hash_64() to not suck there is no need to play games to attempt to improve the hash value on 64-bit. Also, since we don't use the bit value for the variables, use hash_ptr() directly. No change in functionality. Signed-off-by: Peter Zijlstra (Intel) Cc: George Spelvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait_bit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 60a84f5a6cb4..c67c6d24adc2 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -151,12 +151,7 @@ EXPORT_SYMBOL(wake_up_bit); wait_queue_head_t *__var_waitqueue(void *p) { - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); + return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); } EXPORT_SYMBOL(__var_waitqueue); -- cgit v1.2.3-71-gd317 From 83ac4ca943affce56a2c408ddb2f5232f478fb11 Mon Sep 17 00:00:00 2001 From: Uwe Kleine König Date: Mon, 19 Mar 2018 11:52:02 +0100 Subject: genirq: Pass desc to __irq_free instead of irq number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given that irq_to_desc() is a radix_tree_lookup and the reverse operation is only a pointer dereference and that all callers of __free_irq already have the desc, pass the desc instead of the irq number. Signed-off-by: Uwe Kleine-König Signed-off-by: Thomas Gleixner Cc: kernel@pengutronix.de Link: https://lkml.kernel.org/r/20180319105202.9794-1-u.kleine-koenig@pengutronix.de --- kernel/irq/manage.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d2b3c59f1200..b0aa23b2398e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1523,9 +1523,9 @@ EXPORT_SYMBOL_GPL(setup_irq); * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. 
*/ -static struct irqaction *__free_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; @@ -1655,7 +1655,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) struct irq_desc *desc = irq_to_desc(irq); if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); + __free_irq(desc, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1689,7 +1689,7 @@ const void *free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - action = __free_irq(irq, dev_id); + action = __free_irq(desc, dev_id); if (!action) return NULL; -- cgit v1.2.3-71-gd317 From 578ae447e7e5d78c90ac40a06406c1741f79ba96 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 19 Mar 2018 13:18:57 -0500 Subject: jump_label: Disable jump labels in __exit code With the following commit: 333522447063 ("jump_label: Explicitly disable jump labels in __init code") ... we explicitly disabled jump labels in __init code, so they could be detected and not warned about in the following commit: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt") In-kernel __exit code has the same issue. It's never used, so it's freed along with the rest of initmem. But jump label entries in __exit code aren't explicitly disabled, so we get the following warning when enabling pr_debug() in __exit code: can't patch jump_label at dmi_sysfs_exit+0x0/0x2d WARNING: CPU: 0 PID: 22572 at kernel/jump_label.c:376 __jump_label_update+0x9d/0xb0 Fix the warning by disabling all jump labels in initmem (which includes both __init and __exit code). Reported-and-tested-by: Li Wang Signed-off-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Jason Baron Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt") Link: http://lkml.kernel.org/r/7121e6e595374f06616c505b6e690e275c0054d1.1521483452.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 4 ++-- init/main.c | 2 +- kernel/jump_label.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2168cc6b8b30..b46b541c67c4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -151,7 +151,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); -extern void jump_label_invalidate_init(void); +extern void jump_label_invalidate_initmem(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -199,7 +199,7 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } -static inline void jump_label_invalidate_init(void) {} +static inline void jump_label_invalidate_initmem(void) {} static __always_inline bool static_key_false(struct static_key *key) { diff --git a/init/main.c b/init/main.c index 969eaf140ef0..21efbf6ace93 100644 --- a/init/main.c +++ b/init/main.c @@ -1001,7 +1001,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); ftrace_free_init_mem(); - jump_label_invalidate_init(); + jump_label_invalidate_initmem(); free_initmem(); mark_readonly(); 
system_state = SYSTEM_RUNNING; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e7214093dcd1..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -421,15 +422,15 @@ void __init jump_label_init(void) cpus_read_unlock(); } -/* Disable any jump label entries in __init code */ -void __init jump_label_invalidate_init(void) +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; for (iter = iter_start; iter < iter_stop; iter++) { - if (init_kernel_text(iter->code)) + if (init_section_contains((void *)(unsigned long)iter->code, 1)) iter->code = 0; } } -- cgit v1.2.3-71-gd317 From c917e0f259908e75bd2a65877e25f9d90c22c848 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 12 Mar 2018 09:59:43 -0700 Subject: perf/cgroup: Fix child event counting bug When a perf_event is attached to a parent cgroup, it should count events for all child cgroups: parent_group <---- perf_event \ - child_group <---- process(es) However, in our tests, we found this perf_event cannot report reliable results. Here is an example case: # create cgroups mkdir -p /sys/fs/cgroup/p/c # start perf for parent group perf stat -e instructions -G "p" # on another console, run test process in child cgroup: stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs # after the test process is done, stop perf in the first console shows: <not counted> instructions p The instructions should not be "not counted" as the process runs in the child cgroup. We found this is because perf_event->cgrp and cpuctx->cgrp are not identical, thus perf_event->cgrp is not updated properly. This patch fixes this by updating perf_cgroup properly for ancestor cgroup(s).
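The fix applies the same ancestor walk in both cgroup time-update paths; the idiom, as it appears in the diff below:

    struct cgroup_subsys_state *css;

    for (css = &cgrp->css; css; css = css->parent) {
        cgrp = container_of(css, struct perf_cgroup, css);
        /* update this perf_cgroup, then each of its ancestors */
    }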
Reported-by: Ephraim Park Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20180312165943.1057894-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4b838470fac4..709a55b9ad97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + + if (cgrp) { + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + __update_cgrp_time(cgrp); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller @@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, return; cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; + } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -- cgit v1.2.3-71-gd317 From ceb18132248d95b2c68e30c3df78e69175c4452f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Sat, 10 Mar 2018 06:14:51 -0800 Subject: firmware: enable run time change of forcing fallback loader Currently one requires to test four kernel configurations to test the firmware API completely: 0) CONFIG_FW_LOADER=y 1) o CONFIG_FW_LOADER=y o CONFIG_FW_LOADER_USER_HELPER=y 2) o CONFIG_FW_LOADER=y o CONFIG_FW_LOADER_USER_HELPER=y o CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y 3) When CONFIG_FW_LOADER=m the built-in stuff is disabled, we have no current tests for this. We can reduce the requirements to three kernel configurations by making fw_config.force_sysfs_fallback a proc knob we flip on off. For kernels that disable CONFIG_IKCONFIG_PROC this can also enable one to inspect if CONFIG_FW_LOADER_USER_HELPER_FALLBACK was enabled at build time by checking the proc value at boot time. Acked-by: Kees Cook Signed-off-by: Luis R. 
Rodriguez Signed-off-by: Greg Kroah-Hartman --- drivers/base/firmware_loader/fallback.c | 1 + drivers/base/firmware_loader/fallback.h | 4 +++- drivers/base/firmware_loader/fallback_table.c | 17 +++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index 9b65837256d6..45cc40933a47 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "fallback.h" #include "firmware.h" diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h index 550498c7fa4c..ca7e69a8417b 100644 --- a/drivers/base/firmware_loader/fallback.h +++ b/drivers/base/firmware_loader/fallback.h @@ -12,12 +12,14 @@ * * @force_sysfs_fallback: force the sysfs fallback mechanism to be used * as if one had enabled CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y. + * Useful to help debug a CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y + * functionality on a kernel where that config entry has been disabled. * @old_timeout: for internal use * @loading_timeout: the timeout to wait for the fallback mechanism before * giving up, in seconds. */ struct firmware_fallback_config { - const bool force_sysfs_fallback; + unsigned int force_sysfs_fallback; int old_timeout; int loading_timeout; }; diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 981419044c7e..92365e053e30 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -19,6 +19,9 @@ /* Module or buit-in */ #ifdef CONFIG_FW_LOADER_USER_HELPER +static unsigned int zero; +static unsigned int one = 1; + struct firmware_fallback_config fw_fallback_config = { .force_sysfs_fallback = IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK), .loading_timeout = 60, @@ -26,4 +29,18 @@ struct firmware_fallback_config fw_fallback_config = { }; EXPORT_SYMBOL_GPL(fw_fallback_config); +struct ctl_table firmware_config_table[] = { + { + .procname = "force_sysfs_fallback", + .data = &fw_fallback_config.force_sysfs_fallback, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; +EXPORT_SYMBOL_GPL(firmware_config_table); + #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f98f28c12020..bdf7090b106d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -253,6 +253,10 @@ extern struct ctl_table random_table[]; extern struct ctl_table epoll_table[]; #endif +#ifdef CONFIG_FW_LOADER_USER_HELPER +extern struct ctl_table firmware_config_table[]; +#endif + #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; #endif @@ -748,6 +752,13 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3-71-gd317 From a8c024cd9b9683d25ae1f459525dd2c6bec75e79 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Mon, 19 Mar 2018 14:35:54 -0400 Subject: sched/debug: Fix per-task line continuation for console output When the SEQ_printf() macro prints to the console, it runs a simple printk() without KERN_CONT "continued" line printing. 
The result of this is oddly wrapped task info, for example: % echo t > /proc/sysrq-trigger % dmesg ... runnable tasks: ... [ 29.608611] I [ 29.608613] rcu_sched 8 3252.013846 4087 120 [ 29.608614] 0.000000 29.090111 0.000000 [ 29.608615] 0 0 [ 29.608616] / Modify SEQ_printf to use pr_cont() for expected one-line results: % echo t > /proc/sysrq-trigger % dmesg ... runnable tasks: ... [ 106.716329] S cpuhp/5 37 2006.315026 14 120 0.000000 0.496893 0.000000 0 0 / Signed-off-by: Joe Lawrence Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1521484555-8620-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..50026aa2d81e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(sched_debug_lock); if (m) \ seq_printf(m, x); \ else \ - printk(x); \ + pr_cont(x); \ } while (0) /* -- cgit v1.2.3-71-gd317 From e9ca267096674eadd1fd479279bcb58df1486049 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Mon, 19 Mar 2018 14:35:55 -0400 Subject: sched/debug: Adjust newlines for better alignment Scheduler debug stats include newlines that display out of alignment when prefixed by timestamps. For example, the dmesg utility: % echo t > /proc/sysrq-trigger % dmesg ... [ 83.124251] runnable tasks: S task PID tree-key switches prio wait-time sum-exec sum-sleep ----------------------------------------------------------------------------------------------------------- At the same time, some syslog utilities (like rsyslog by default) don't like the additional newline control characters, saving lines like this to /var/log/messages: Mar 16 16:02:29 localhost kernel: #012runnable tasks:#012 S task PID tree-key ... ^^^^ ^^^^ Clean these up by moving newline characters to their own SEQ_printf invocation. This leaves the /proc/sched_debug output unchanged, but brings the entire output into alignment when prefixed: % echo t > /proc/sysrq-trigger % dmesg ... [ 62.410368] runnable tasks: [ 62.410368] S task PID tree-key switches prio wait-time sum-exec sum-sleep [ 62.410369] ----------------------------------------------------------------------------------------------------------- [ 62.410369] I kworker/u12:0 5 1932.215593 332 120 0.000000 3.621252 0.000000 0 0 / and no escaped control characters from rsyslog in /var/log/messages: Mar 16 16:15:06 localhost kernel: runnable tasks: Mar 16 16:15:06 localhost kernel: S task PID tree-key ...
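The transformation in the diff below boils down to giving every newline its own SEQ_printf() invocation, so each payload line lands behind its own timestamp prefix:

    /* before: the leading newline and the payload share one record */
    SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);

    /* after: flush first, then emit the payload on a fresh record */
    SEQ_printf(m, "\n");
    SEQ_printf(m, "cfs_rq[%d]:\n", cpu);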
Signed-off-by: Joe Lawrence Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1521484555-8620-3-git-send-email-joe.lawrence@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 50026aa2d81e..72c401b3b15c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -501,12 +501,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - SEQ_printf(m, - "\nrunnable tasks:\n" - " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n" - "-------------------------------------------------------" - "----------------------------------------------------\n"); + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); + SEQ_printf(m, " S task PID tree-key switches prio" + " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, "-------------------------------------------------------" + "----------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -527,9 +527,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); #else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -595,9 +597,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); #else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:\n", cpu); #endif #define P(x) \ @@ -624,7 +628,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { struct dl_bw *dl_bw; - SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "dl_rq[%d]:\n", cpu); #define PU(x) \ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) -- cgit v1.2.3-71-gd317 From 99bfce5db9c071800bdc7e9658a68e6d11aeecf6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Mar 2018 22:15:16 +0100 Subject: genirq: Cleanup top of file comments Remove pointless references to the file name itself and condense the information so it wastes less space. 
Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Link: https://lkml.kernel.org/r/20180314212030.412095827@linutronix.de --- kernel/irq/autoprobe.c | 2 -- kernel/irq/chip.c | 9 +++------ kernel/irq/handle.c | 7 ++----- kernel/irq/ipi.c | 2 -- kernel/irq/irqdesc.c | 5 ++--- kernel/irq/manage.c | 2 -- kernel/irq/msi.c | 2 -- kernel/irq/pm.c | 2 -- kernel/irq/proc.c | 2 -- kernel/irq/resend.c | 2 -- kernel/irq/spurious.c | 2 -- kernel/irq/timings.c | 2 -- 12 files changed, 7 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 8c82ea26e837..16cbf6beb276 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/autoprobe.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the interrupt probing code and driver APIs. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c69357a43849..b575c7a93b76 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1,13 +1,10 @@ /* - * linux/kernel/irq/chip.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code, for irq-chip - * based architectures. - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code, for irq-chip based + * architectures. Detailed information is available in + * Documentation/core-api/genericirq.rst */ #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3570c715c3e7..1bf69b6571a3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,12 +1,9 @@ /* - * linux/kernel/irq/handle.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code. Detailed + * information is available in Documentation/core-api/genericirq.rst * */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 259a22aa9934..4da9cf0a935c 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -1,6 +1,4 @@ /* - * linux/kernel/irq/ipi.c - * * Copyright (C) 2015 Imagination Technologies Ltd * Author: Qais Yousef * diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d9ded088d336..55515cce1cc2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -2,9 +2,8 @@ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the interrupt descriptor management code - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the interrupt descriptor management code. 
Detailed + * information is available in Documentation/core-api/genericirq.rst * */ #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b0aa23b2398e..99eb1977e784 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,6 +1,4 @@ /* - * linux/kernel/irq/manage.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2f3c4f5382cc..8429c88427ff 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1,6 +1,4 @@ /* - * linux/kernel/irq/msi.c - * * Copyright (C) 2014 Intel Corp. * Author: Jiang Liu * diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 6bd9b58429cc..474d3fa32a28 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -1,6 +1,4 @@ /* - * linux/kernel/irq/pm.c - * * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. * * This file contains power management functions related to interrupts. diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e8f374971e37..7cb091d81d91 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/proc.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the /proc/irq/ handling code. diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1d08f45135c2..95414ad3506a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/resend.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6cdecc6f4c53..d867d6ddafdd 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/spurious.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains spurious interrupt handling. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e0923fa4927a..c87099b2ea50 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,6 +1,4 @@ /* - * linux/kernel/irq/timings.c - * * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3-71-gd317 From 90cafdd521721eb588a738fae17ac70e50675686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Mar 2018 22:15:17 +0100 Subject: genirq/matrix: Cleanup SPDX identifier Use the proper SPDX-Identifier format. 
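[ Illustration, not part of the patch series: the target format. Per Documentation/process/license-rules.rst, the SPDX tag is expected on the first line of the file, using C99 comment style in .c source files and a C-style comment in headers, e.g.: ]

// SPDX-License-Identifier: GPL-2.0
(first line of a .c source file)

/* SPDX-License-Identifier: GPL-2.0 */
(first line of a .h header file)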
Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Link: https://lkml.kernel.org/r/20180314212030.492674761@linutronix.de --- kernel/irq/matrix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 4c5770407031..5092494bf261 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -1,8 +1,6 @@ -/* - * Copyright (C) 2017 Thomas Gleixner - * - * SPDX-License-Identifier: GPL-2.0 - */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Thomas Gleixner + #include #include #include -- cgit v1.2.3-71-gd317 From 52a65ff5603e685e9b19c2e108b3f0826dc7a86b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Mar 2018 22:15:19 +0100 Subject: genirq: Add missing SPDX identifiers Add SPDX identifiers to files - which contain an explicit license boilerplate or reference - which do not contain a license reference and were not updated in the initial SPDX conversion because the license was deduced by the scanners via EXPORT_SYMBOL_GPL as GPL-2.0 only. [ tglx: Moved adding identifiers from the patch which removes the references/boilerplate ] Signed-off-by: Thomas Gleixner Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Link: https://lkml.kernel.org/r/20180314212030.668321222@linutronix.de --- kernel/irq/chip.c | 1 + kernel/irq/cpuhotplug.c | 1 + kernel/irq/debugfs.c | 1 + kernel/irq/devres.c | 1 + kernel/irq/dummychip.c | 1 + kernel/irq/generic-chip.c | 1 + kernel/irq/handle.c | 1 + kernel/irq/ipi.c | 1 + kernel/irq/irq_sim.c | 1 + kernel/irq/irqdesc.c | 1 + kernel/irq/irqdomain.c | 2 ++ kernel/irq/manage.c | 1 + kernel/irq/msi.c | 1 + kernel/irq/pm.c | 1 + kernel/irq/timings.c | 1 + 15 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b575c7a93b76..a2b3d9de999c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 9eb09aef0313..5b1072e394b2 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic cpu hotunplug interrupt migration code copied from the * arch/arm implementation diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index acfaaef8672a..c8cc6ce9c1c3 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright 2017 Thomas Gleixner * diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 194c506d9d20..6a682c229e10 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 326a67f2410b..0b0cdf206dc4 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 508c03dfef25..e2999a070a99 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Library implementing the most common irq chip callback
functions * diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1bf69b6571a3..38554bc35375 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 4da9cf0a935c..8b778e37dc6d 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Imagination Technologies Ltd * Author: Qais Yousef diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 85690859a2a8..fc4f361a86bb 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Bartosz Golaszewski * diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 55515cce1cc2..afc7f902d74a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 82b8b18ee1eb..5d9fc01b60a6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #define pr_fmt(fmt) "irq: " fmt #include diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99eb1977e784..e3336d904f64 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8429c88427ff..2a8571f72b17 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Intel Corp. * Author: Jiang Liu diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 474d3fa32a28..d6961d3c6f9e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. * diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c87099b2ea50..223d47937c48 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano * -- cgit v1.2.3-71-gd317 From f3f59fbc54b76945ebc92772bd86f60728205da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Mar 2018 14:17:04 +0100 Subject: genirq: Remove license boilerplate/references Now that SPDX identifiers are in place, remove the boilerplate or references. The change in timings.c has been acked by the author. Signed-off-by: Thomas Gleixner Acked-by: Daniel Lezcano Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Link: https://lkml.kernel.org/r/20180314212030.668321222@linutronix.de --- kernel/irq/debugfs.c | 7 ++----- kernel/irq/timings.c | 10 ++-------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c8cc6ce9c1c3..4dadeb3d6666 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2017 Thomas Gleixner - * - * This file is licensed under the GPL V2. 
- */ +// Copyright 2017 Thomas Gleixner + #include #include #include diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 223d47937c48..1e4cb63a5c82 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,12 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ +// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano + #include #include #include -- cgit v1.2.3-71-gd317 From 5826cc8f5a6094ce6565a8e0008c9f07ebf724b1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 Mar 2018 17:24:05 +0800 Subject: workqueue: fix the comments of nr_idle Since the worker rebinding behavior was refactored, there is no idle worker off the idle_list now. The comment is outdated and can just be removed. It also groups nr_workers and nr_idle together. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7df85fa9f651..eeebfa65b0ef 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -153,10 +153,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ struct list_head worklist; /* L: list of pending works */ - int nr_workers; /* L: total number of workers */ - /* nr_idle includes the ones off idle_list for rebinding */ - int nr_idle; /* L: currently idle ones */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ -- cgit v1.2.3-71-gd317 From f75da8a8a918d7c343a2ec95d1ed99e5689e0f23 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 Mar 2018 17:24:32 +0800 Subject: workqueue: remove the comment about the old manager_arb mutex The manager_arb mutex doesn't exist any more. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eeebfa65b0ef..636c6d702b93 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -165,7 +165,6 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - /* see manage_workers() for details on the two manager mutexes */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ -- cgit v1.2.3-71-gd317 From f005afede992e265bb98534b86912bb669ccd0d2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 20 Mar 2018 11:19:17 -0700 Subject: trace/bpf: remove helper bpf_perf_prog_read_value from tracepoint type programs Commit 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value") added helper bpf_perf_prog_read_value so that a perf_event type program can read the event counter and enabled/running time. This commit, however, introduced a bug which allows this helper to be used by tracepoint type programs. This is incorrect as bpf_perf_prog_read_value needs to access perf_event through its bpf_perf_event_data_kern type context, which is not available for tracepoint type programs.
This patch fixed the issue by separating bpf_func_proto between tracepoint and perf_event type programs and removed bpf_perf_prog_read_value from tracepoint func prototype. Fixes: 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 68 ++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c0a9e310d715..01e6b3a38871 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); + return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; @@ -678,8 +712,8 @@ clear: return err; } -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { - .func = bpf_perf_prog_read_value_tp, +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_perf_prog_read_value: - return &bpf_perf_prog_read_value_proto_tp; + return &bpf_perf_prog_read_value_proto; default: return tracing_func_proto(func_id); } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - - BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); - return true; -} - -const struct bpf_verifier_ops tracepoint_verifier_ops = { - .get_func_proto = tp_prog_func_proto, - .is_valid_access = tp_prog_is_valid_access, -}; - -const struct bpf_prog_ops tracepoint_prog_ops = { -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ 
-779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, } const struct bpf_verifier_ops perf_event_verifier_ops = { - .get_func_proto = tp_prog_func_proto, + .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; -- cgit v1.2.3-71-gd317 From 0fa4fe85f4724fff89b09741c437cbee9cf8b008 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Mon, 19 Mar 2018 17:57:27 -0700 Subject: bpf: skip unnecessary capability check The current check statement in BPF syscall will do a capability check for CAP_SYS_ADMIN before checking sysctl_unprivileged_bpf_disabled. This code path will trigger unnecessary security hooks on capability checking and cause false alarms on unprivileged process trying to get CAP_SYS_ADMIN access. This can be resolved by simply switch the order of the statement and CAP_SYS_ADMIN is not required anyway if unprivileged bpf syscall is allowed. Signed-off-by: Chenbo Feng Acked-by: Lorenzo Colitti Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..43f95d190eea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) + if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; err = check_uarg_tail_zero(uattr, sizeof(attr), size); -- cgit v1.2.3-71-gd317 From dd206bec9a446884805370b1c16c1d7a97036777 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 20 Mar 2018 21:51:06 +0300 Subject: pidns: simpler allocation of pid_* caches Those pid_* caches are created on demand when a process advances to the new level of pid namespace. Which means pointers are stable, write only and thus can be packed into an array instead of spreading them over and using lists(!) to find them. Both first and subsequent clone/unshare(CLONE_NEWPID) become faster. Signed-off-by: Alexey Dobriyan Signed-off-by: Eric W. Biederman --- kernel/pid_namespace.c | 67 ++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef7d34b..773b2b31734f 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,55 +23,39 @@ #include #include -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 +/* Write once array, filled from the beginning. */ +static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. 
- * @nr_ids: the number of numerical ids this pid will have to carry + * @level: pid namespace level */ -static struct kmem_cache *create_pid_cachep(int nr_ids) +static struct kmem_cache *create_pid_cachep(unsigned int level) { - struct pid_cache *pcache; - struct kmem_cache *cachep; - + /* Level 0 is init_pid_ns.pid_cachep */ + struct kmem_cache **pkc = &pid_cache[level - 1]; + struct kmem_cache *kc; + char name[4 + 10 + 1]; + unsigned int len; + + kc = READ_ONCE(*pkc); + if (kc) + return kc; + + snprintf(name, sizeof(name), "pid_%u", level + 1); + len = sizeof(struct pid) + level * sizeof(struct upid); mutex_lock(&pid_caches_mutex); - list_for_each_entry(pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: + /* Name collision forces to do allocation under mutex. */ + if (!*pkc) + *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; + /* current can fail, but someone else can succeed. */ + return READ_ONCE(*pkc); } static void proc_cleanup_work(struct work_struct *work) @@ -80,9 +64,6 @@ static void proc_cleanup_work(struct work_struct *work) pid_ns_release_proc(ns); } -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -119,7 +100,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns idr_init(&ns->idr); - ns->pid_cachep = create_pid_cachep(level + 1); + ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; -- cgit v1.2.3-71-gd317 From 94b9d9b7a14cbb1640868d53b27f403ed2e5b4a9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 21 Mar 2018 04:42:20 -0400 Subject: audit: remove path param from link denied function In commit 45b578fe4c3cade6f4ca1fc934ce199afd857edc ("audit: link denied should not directly generate PATH record") the need for the struct path *link parameter was removed. Remove the now useless struct path argument. 
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- fs/namei.c | 4 ++-- include/linux/audit.h | 6 ++---- kernel/audit.c | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index 9cc91fb7f156..e3682bb72cb5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -945,7 +945,7 @@ static inline int may_follow_link(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) return -ECHILD; - audit_log_link_denied("follow_link", &nd->stack[0].link); + audit_log_link_denied("follow_link"); return -EACCES; } @@ -1011,7 +1011,7 @@ static int may_linkat(struct path *link) if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; - audit_log_link_denied("linkat", link); + audit_log_link_denied("linkat"); return -EPERM; } diff --git a/include/linux/audit.h b/include/linux/audit.h index af410d9fbf2d..75d5b031e802 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -146,8 +146,7 @@ extern void audit_log_d_path(struct audit_buffer *ab, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); -extern void audit_log_link_denied(const char *operation, - const struct path *link); +extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); @@ -194,8 +193,7 @@ static inline void audit_log_d_path(struct audit_buffer *ab, { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } -static inline void audit_log_link_denied(const char *string, - const struct path *link) +static inline void audit_log_link_denied(const char *string) { } static inline int audit_log_task_context(struct audit_buffer *ab) { diff --git a/kernel/audit.c b/kernel/audit.c index 3f2f143edadf..e8bf8d78ac4a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2308,9 +2308,8 @@ EXPORT_SYMBOL(audit_log_task_info); /** * audit_log_link_denied - report a link restriction denial * @operation: specific link operation - * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, const struct path *link) +void audit_log_link_denied(const char *operation) { struct audit_buffer *ab; -- cgit v1.2.3-71-gd317 From 19b558db12f9f4e45a22012bae7b4783e62224da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Feb 2018 17:21:55 +0100 Subject: posix-timers: Protect posix clock array access against speculation The clockid argument of clockid_to_kclock() comes straight from user space via various syscalls and is used as index into the posix_clocks array. Protect it against spectre v1 array out of bounds speculation. Remove the redundant check for !posix_clock[id] as this is another source for speculation and does not provide any advantage over the return posix_clock[id] path which returns NULL in that case anyway. 
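[ Illustration, not part of the patch: the nospec pattern in minimal form. array_index_nospec() from <linux/nospec.h> clamps an index to [0, size) without a conditional branch, so a mispredicted bounds check cannot turn into an out-of-bounds speculative load. The helper below is a simplified sketch, not the kernel's actual code; it folds the negative-id (dynamic/CPU clock) cases into the NULL path: ]

#include <linux/kernel.h>
#include <linux/nospec.h>

static const struct k_clock *clockid_to_kclock_sketch(clockid_t id)
{
        /* Architectural bounds check; the CPU may still speculate past it. */
        if (id < 0 || id >= ARRAY_SIZE(posix_clocks))
                return NULL;

        /* Masked index: even a speculative load stays inside the array. */
        return posix_clocks[array_index_nospec((unsigned int)id,
                                               ARRAY_SIZE(posix_clocks))];
}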
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Dan Williams Cc: Rasmus Villemoes Cc: Greg KH Cc: stable@vger.kernel.org Cc: Linus Torvalds Cc: David Woodhouse Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1802151718320.1296@nanos.tec.linutronix.de --- kernel/time/posix-timers.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..10b7186d0638 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = { static const struct k_clock *clockid_to_kclock(const clockid_t id) { - if (id < 0) + clockid_t idx = id; + + if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; + } - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + if (id >= ARRAY_SIZE(posix_clocks)) return NULL; - return posix_clocks[id]; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } -- cgit v1.2.3-71-gd317 From 47319f7186b8746521b0ee674054b29d7b35578b Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 22 Mar 2018 14:58:33 +0100 Subject: printk: change message to pr_info To allow userspace to prevent this message from appearing in the console by changing the log priority. This matches other informative messages that the power subsystem emits when the system changes power states. Link: http://lkml.kernel.org/r/20180322135833.16602-1-tomeu.vizoso@collabora.com To: linux-kernel@vger.kernel.org Cc: kernel@collabora.com Cc: Steven Rostedt Signed-off-by: Tomeu Vizoso Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dc663f288463..333f58eac420 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2161,7 +2161,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; up_console_sem(); -- cgit v1.2.3-71-gd317 From 5e4cf2bf6d1c198a90ccc0df5ffd8e0d4ea36b48 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 Mar 2018 14:37:36 +0300 Subject: tracing: Fix a potential NULL dereference We forgot to set the error code on this path so we return ERR_PTR(0) which is NULL. It results in a NULL dereference in the caller. 
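[ Illustration, not part of the patch: why a zero error code bites. ERR_PTR(0) is simply NULL, and IS_ERR(NULL) is false, so a caller that only checks IS_ERR() proceeds to dereference the NULL pointer. The standalone userspace program below re-implements the <linux/err.h> helpers purely to demonstrate this; it is not kernel code: ]

#include <stdio.h>

#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline int IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); }

int main(void)
{
        void *ok  = ERR_PTR(-22);       /* -EINVAL: IS_ERR() catches it */
        void *bug = ERR_PTR(0);         /* error code never set: plain NULL */

        printf("IS_ERR(ERR_PTR(-EINVAL)) = %d\n", IS_ERR(ok));          /* 1 */
        printf("IS_ERR(ERR_PTR(0))       = %d (ptr %p)\n", IS_ERR(bug), bug); /* 0 */
        return 0;
}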
Link: http://lkml.kernel.org/r/20180323113735.GC28518@mwanda Fixes: 100719dcef44 ("tracing: Add simple expression support to hist triggers") Acked-by: Tom Zanussi Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f027642ceef..a02bc09d765a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2776,6 +2776,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->fn = hist_field_plus; break; default: + ret = -EINVAL; goto free; } -- cgit v1.2.3-71-gd317 From c5d343b6b7badd1f5fe0873eff2e8d63a193e732 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 17 Mar 2018 21:38:10 +0900 Subject: tracing: probeevent: Fix to support minus offset from symbol In Documentation/trace/kprobetrace.txt, it says @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) However, the parser doesn't parse a minus offset correctly, since commit 2fba0c8867af ("tracing/kprobes: Fix probe offset to be unsigned") drops minus ("-") offset support for kprobe probe address usage. This fixes traceprobe_split_symbol_offset() to parse a minus offset again, with offset range checking, and adds a minus offset check to the kprobe probe address usage. Link: http://lkml.kernel.org/r/152129028983.31874.13419301530285775521.stgit@devbox Cc: Ingo Molnar Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Ravi Bangoria Cc: stable@vger.kernel.org Fixes: 2fba0c8867af ("tracing/kprobes: Fix probe offset to be unsigned") Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_probe.c | 8 +++----- kernel/trace/trace_probe.h | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1fad24acd444..ae4147eaebd4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -659,7 +659,7 @@ static int create_trace_kprobe(int argc, char **argv) char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; char *arg; - unsigned long offset = 0; + long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -747,7 +747,7 @@ static int create_trace_kprobe(int argc, char **argv) symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); - if (ret) { + if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d59357308677..daf54bda4dc8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type, } /* Split symbol and offset.
*/ -int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +int traceprobe_split_symbol_offset(char *symbol, long *offset) { char *tmp; int ret; @@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) if (!offset) return -EINVAL; - tmp = strchr(symbol, '+'); + tmp = strpbrk(symbol, "+-"); if (tmp) { - /* skip sign because kstrtoul doesn't accept '+' */ - ret = kstrtoul(tmp + 1, 0, offset); + ret = kstrtol(tmp, 0, offset); if (ret) return ret; - *tmp = '\0'; } else *offset = 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e101c5bb9eda..6a4d3fa94042 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name, extern void traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); +extern int traceprobe_split_symbol_offset(char *symbol, long *offset); /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int -- cgit v1.2.3-71-gd317 From abe0884011f1a569bc1254b70c41525b755e8037 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Mar 2018 11:41:28 +0100 Subject: bpf: Remove struct bpf_verifier_env argument from print_bpf_insn We use print_bpf_insn in user space (bpftool and soon perf), so it'd be nice to keep it generic and strip it off the kernel struct bpf_verifier_env argument. This argument can be safely removed, because its users can use the struct bpf_insn_cbs::private_data to pass it. By changing the argument type we can no longer have clean 'verbose' alias to 'bpf_verifier_log_write' in verifier.c. Instead we're adding the 'verbose' cb_print callback and removing the alias. This way we have new cb_print callback in place, and all the 'verbose(env, ...) calls in verifier.c will cleanly cast to 'verbose(void *, ...)' so no other change is needed. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.c | 52 +++++++++++++++++++++++++-------------------------- kernel/bpf/disasm.h | 5 +---- kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++----------------- 3 files changed, 54 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 8740406df2cd..d6b76377cb6e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = { }; static void print_bpf_end_insn(bpf_insn_print_t verbose, - struct bpf_verifier_env *env, + void *private_data, const struct bpf_insn *insn) { - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks) { @@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else - print_bpf_end_insn(verbose, env, insn); + print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", + verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose(env, "BUG_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); return; } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = %s\n", + verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, 
imm, tmp, sizeof(tmp))); } else { - verbose(env, "BUG_ld_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { @@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { - verbose(env, "(%02x) call pc%s\n", + verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); - verbose(env, "(%02x) call %s#%d\n", insn->code, + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", + verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); + verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose(env, "(%02x) %s\n", + verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 266fe8ee542b..e1324a834a24 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -22,14 +22,12 @@ #include #endif -struct bpf_verifier_env; - extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, +typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); @@ -45,7 +43,6 @@ struct bpf_insn_cbs { }; void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9f7c20691c1..e93a6e48641b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,23 +168,16 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) 
+static void log_write(struct bpf_verifier_env *env, const char *fmt, + va_list args) { struct bpf_verifer_log *log = &env->log; unsigned int n; - va_list args; if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; - va_start(args, fmt); n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - va_end(args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); @@ -197,14 +190,30 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, else log->ubuf = NULL; } -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); -/* Historically bpf_verifier_log_write was called verbose, but the name was too - * generic for symbol export. The function was renamed, but not the calls in - * the verifier to avoid complicating backports. Hence the alias below. + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) - __attribute__((alias("bpf_verifier_log_write"))); +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + log_write(env, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + log_write(private_data, fmt, args); + va_end(args); +} static bool type_is_pkt_pointer(enum bpf_reg_type type) { @@ -4600,10 +4609,11 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, + .private_data = env, }; verbose(env, "%d: ", insn_idx); - print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { -- cgit v1.2.3-71-gd317 From e97a90f7069b740575bcb1dae86596e0484b8957 Mon Sep 17 00:00:00 2001 From: Claudio Scordino Date: Tue, 13 Mar 2018 11:35:40 +0100 Subject: sched/cpufreq: Rate limits for SCHED_DEADLINE When the SCHED_DEADLINE scheduling class increases the CPU utilization, it should not wait for the rate limit, otherwise it may miss some deadline. Tests using rt-app on Exynos5422 with up to 10 SCHED_DEADLINE tasks have shown reductions of even 10% of deadline misses with a negligible increase of energy consumption (measured through Baylibre Cape). Signed-off-by: Claudio Scordino Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Cc: Juri Lelli Cc: Joel Fernandes Cc: Vincent Guittot Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Todd Kjos Cc: Dietmar Eggemann Link: https://lkml.kernel.org/r/1520937340-2755-1-git-send-email-claudio@evidence.eu.com --- kernel/sched/cpufreq_schedutil.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 89fe78ecb88c..2b124811947d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -267,6 +267,16 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ +/* + * Make sugov_should_update_freq() ignore the rate limit when DL + * has increased the utilization. + */ +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +{ + if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + sg_policy->need_freq_update = true; +} + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -279,6 +289,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (!sugov_should_update_freq(sg_policy, time)) return; @@ -356,6 +368,8 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; + ignore_dl_rate_limit(sg_cpu, sg_policy); + if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); -- cgit v1.2.3-71-gd317 From b9193c1b61ddb97da4713155b0d580e41fb544ac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:22 -0700 Subject: bpf: Rename bpf_verifer_log bpf_verifer_log => bpf_verifier_log Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 6 +++--- kernel/bpf/verifier.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6b66cd1aa0b9..c30668414b22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -153,7 +153,7 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 -struct bpf_verifer_log { +struct bpf_verifier_log { u32 level; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; @@ -161,7 +161,7 @@ struct bpf_verifer_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { return log->len_used >= log->len_total - 1; } @@ -185,7 +185,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + struct bpf_verifier_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e93a6e48641b..1e84e02ff733 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -171,7 +171,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); static void 
log_write(struct bpf_verifier_env *env, const char *fmt, va_list args) { - struct bpf_verifer_log *log = &env->log; + struct bpf_verifier_log *log = &env->log; unsigned int n; if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) @@ -5611,7 +5611,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { struct bpf_verifier_env *env; - struct bpf_verifer_log *log; + struct bpf_verifier_log *log; int ret = -EINVAL; /* no program is valid */ -- cgit v1.2.3-71-gd317 From 77d2e05abd45886dcad2b632c738cf46b9f7c19e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:23 -0700 Subject: bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed() The BTF (BPF Type Format) verifier needs to reuse the current BPF verifier log. Hence, it requires the following changes: (1) Expose log_write() in verifier.c for other users. Its name is renamed to bpf_verifier_vlog(). (2) The BTF verifier also needs to check 'log->level && log->ubuf && !bpf_verifier_log_full(log);' independently outside of the current log_write(). It is because the BTF verifier will do one-check before making multiple calls to btf_verifier_vlog to log the details of a type. Hence, this check is also re-factored to a new function bpf_verifier_log_needed(). Since it is re-factored, we can check it before va_start() in the current bpf_verifier_log_write() and verbose(). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 7 +++++++ kernel/bpf/verifier.c | 19 +++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c30668414b22..7e61c395fddf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -166,6 +166,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) +{ + return log->level && log->ubuf && !bpf_verifier_log_full(log); +} + #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -192,6 +197,8 @@ struct bpf_verifier_env { u32 subprog_cnt; }; +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e84e02ff733..8acd2207e412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,15 +168,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -static void log_write(struct bpf_verifier_env *env, const char *fmt, - va_list args) +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) { - struct bpf_verifier_log *log = &env->log; unsigned int n; - if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) - return; - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, @@ -200,18 +196,25 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, { va_list args; + if (!bpf_verifier_log_needed(&env->log)) + return; + va_start(args, fmt); - log_write(env, fmt, args); + bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_verifier_log_write); __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) 
{ + struct bpf_verifier_env *env = private_data; va_list args; + if (!bpf_verifier_log_needed(&env->log)) + return; + va_start(args, fmt); - log_write(private_data, fmt, args); + bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } -- cgit v1.2.3-71-gd317 From 447a5647c9e7ab97780541f546a90e2620caa487 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 21 Mar 2018 15:09:32 -0700 Subject: treewide: Align function definition open/close braces Some functions definitions have either the initial open brace and/or the closing brace outside of column 1. Move those braces to column 1. This allows various function analyzers like gnu complexity to work properly for these modified functions. Signed-off-by: Joe Perches Acked-by: Andy Shevchenko Acked-by: Paul Moore Acked-by: Alex Deucher Acked-by: Dave Chinner Reviewed-by: Darrick J. Wong Acked-by: Alexandre Belloni Acked-by: Martin K. Petersen Acked-by: Takashi Iwai Acked-by: Mauro Carvalho Chehab Acked-by: Rafael J. Wysocki Acked-by: Nicolin Chen Acked-by: Martin K. Petersen Acked-by: Steven Rostedt (VMware) Signed-off-by: Jiri Kosina --- arch/x86/include/asm/atomic64_32.h | 2 +- drivers/acpi/custom_method.c | 2 +- drivers/acpi/fan.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/media/i2c/msp3400-kthreads.c | 2 +- drivers/message/fusion/mptsas.c | 2 +- drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c | 2 +- drivers/net/wireless/ath/ath9k/xmit.c | 2 +- drivers/platform/x86/eeepc-laptop.c | 2 +- drivers/rtc/rtc-ab-b5ze-s3.c | 2 +- drivers/scsi/dpt_i2o.c | 2 +- drivers/scsi/sym53c8xx_2/sym_glue.c | 2 +- fs/locks.c | 2 +- fs/ocfs2/stack_user.c | 2 +- fs/xfs/xfs_export.c | 2 +- kernel/audit.c | 6 +++--- kernel/trace/trace_printk.c | 4 ++-- lib/raid6/sse2.c | 14 +++++++------- sound/soc/fsl/fsl_dma.c | 2 +- 19 files changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 97c46b8169b7..d4d4883080fa 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -122,7 +122,7 @@ static inline long long atomic64_read(const atomic64_t *v) long long r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; - } +} /** * atomic64_add_return - add and return diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index c68e72414a67..e967c1173ba3 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -94,7 +94,7 @@ static void __exit acpi_custom_method_exit(void) { if (cm_dentry) debugfs_remove(cm_dentry); - } +} module_init(acpi_custom_method_init); module_exit(acpi_custom_method_exit); diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 6cf4988206f2..3563103590c6 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -219,7 +219,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) return fan_set_state_acpi4(device, state); else return fan_set_state(device, state); - } +} static const struct thermal_cooling_device_ops fan_cooling_ops = { .get_max_state = fan_get_max_state, diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 12868c769606..5b15c04ce751 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -463,7 +463,7 @@ static void disable_dangling_plane(struct dc *dc, struct dc_state *context) ******************************************************************************/ struct dc *dc_create(const struct dc_init_data 
*init_params) - { +{ struct dc *dc = kzalloc(sizeof(*dc), GFP_KERNEL); unsigned int full_pipe_count; diff --git a/drivers/media/i2c/msp3400-kthreads.c b/drivers/media/i2c/msp3400-kthreads.c index 4dd01e9f553b..dc6cb8d475b3 100644 --- a/drivers/media/i2c/msp3400-kthreads.c +++ b/drivers/media/i2c/msp3400-kthreads.c @@ -885,7 +885,7 @@ static int msp34xxg_modus(struct i2c_client *client) } static void msp34xxg_set_source(struct i2c_client *client, u16 reg, int in) - { +{ struct msp_state *state = to_state(i2c_get_clientdata(client)); int source, matrix; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 439ee9c5f535..231f3a1e27bf 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2967,7 +2967,7 @@ out_unlock: mutex_unlock(&ioc->sas_mgmt.mutex); out: return ret; - } +} static void mptsas_parse_device_info(struct sas_identify *identify, diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c index 3dd973475125..0ea141ece19e 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c @@ -603,7 +603,7 @@ static struct uni_table_desc *nx_get_table_desc(const u8 *unirom, int section) static int netxen_nic_validate_header(struct netxen_adapter *adapter) - { +{ const u8 *unirom = adapter->fw->data; struct uni_table_desc *directory = (struct uni_table_desc *) &unirom[0]; u32 fw_file_size = adapter->fw->size; diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index 396bf05c6bf6..88be55ed5b4d 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -252,7 +252,7 @@ ath_tid_pull(struct ath_atx_tid *tid) } return skb; - } +} static bool ath_tid_has_buffered(struct ath_atx_tid *tid) diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 5a681962899c..4c38904a8a32 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -492,7 +492,7 @@ static void eeepc_platform_exit(struct eeepc_laptop *eeepc) * potentially bad time, such as a timer interrupt. 
*/ static void tpd_led_update(struct work_struct *work) - { +{ struct eeepc_laptop *eeepc; eeepc = container_of(work, struct eeepc_laptop, tpd_led_work); diff --git a/drivers/rtc/rtc-ab-b5ze-s3.c b/drivers/rtc/rtc-ab-b5ze-s3.c index a319bf1e49de..ef5c16dfabfa 100644 --- a/drivers/rtc/rtc-ab-b5ze-s3.c +++ b/drivers/rtc/rtc-ab-b5ze-s3.c @@ -648,7 +648,7 @@ static int abb5zes3_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) ret); return ret; - } +} /* Enable or disable battery low irq generation */ static inline int _abb5zes3_rtc_battery_low_irq_enable(struct regmap *regmap, diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index fd172b0890d3..a00d822e3142 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -3524,7 +3524,7 @@ static int adpt_i2o_systab_send(adpt_hba* pHba) #endif return ret; - } +} /*============================================================================ diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c index 791a2182de53..7320d5fe4cbc 100644 --- a/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -1393,7 +1393,7 @@ static struct Scsi_Host *sym_attach(struct scsi_host_template *tpnt, int unit, scsi_host_put(shost); return NULL; - } +} /* diff --git a/fs/locks.c b/fs/locks.c index d6ff4beb70ce..62bbe8b31f26 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -559,7 +559,7 @@ static const struct lock_manager_operations lease_manager_ops = { * Initialize a lease, use the default lock manager operations */ static int lease_init(struct file *filp, long type, struct file_lock *fl) - { +{ if (assign_type(fl, type) != 0) return -EINVAL; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index dae9eb7c441e..d2fb97b173da 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -398,7 +398,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) - { +{ long major, minor; char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index fe1bfee35898..7d5c355d78b5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -122,7 +122,7 @@ xfs_nfs_get_inode( struct super_block *sb, u64 ino, u32 generation) - { +{ xfs_mount_t *mp = XFS_M(sb); xfs_inode_t *ip; int error; diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..d97e8f0f73ca 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -443,15 +443,15 @@ static int audit_set_failure(u32 state) * Drop any references inside the auditd connection tracking struct and free * the memory. */ - static void auditd_conn_free(struct rcu_head *rcu) - { +static void auditd_conn_free(struct rcu_head *rcu) +{ struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); - } +} /** * auditd_set - Set/Reset the auditd connection state diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ad1d6164e946..50f44b7b2b32 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = { }; int __trace_bprintk(unsigned long ip, const char *fmt, ...) - { +{ int ret; va_list ap; @@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) 
EXPORT_SYMBOL_GPL(__trace_bprintk); int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) - { +{ if (unlikely(!fmt)) return 0; diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 1d2276b007ee..8191e1d0d2fb 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -91,7 +91,7 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) static void raid6_sse21_xor_syndrome(int disks, int start, int stop, size_t bytes, void **ptrs) - { +{ u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; @@ -200,9 +200,9 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } - static void raid6_sse22_xor_syndrome(int disks, int start, int stop, +static void raid6_sse22_xor_syndrome(int disks, int start, int stop, size_t bytes, void **ptrs) - { +{ u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; @@ -265,7 +265,7 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) asm volatile("sfence" : : : "memory"); kernel_fpu_end(); - } +} const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, @@ -366,9 +366,9 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } - static void raid6_sse24_xor_syndrome(int disks, int start, int stop, +static void raid6_sse24_xor_syndrome(int disks, int start, int stop, size_t bytes, void **ptrs) - { +{ u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; @@ -471,7 +471,7 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) } asm volatile("sfence" : : : "memory"); kernel_fpu_end(); - } +} const struct raid6_calls raid6_sse2x4 = { diff --git a/sound/soc/fsl/fsl_dma.c b/sound/soc/fsl/fsl_dma.c index 8c2981b70f64..5326c5addd1c 100644 --- a/sound/soc/fsl/fsl_dma.c +++ b/sound/soc/fsl/fsl_dma.c @@ -879,7 +879,7 @@ static const struct snd_pcm_ops fsl_dma_ops = { }; static int fsl_soc_dma_probe(struct platform_device *pdev) - { +{ struct dma_object *dma; struct device_node *np = pdev->dev.of_node; struct device_node *ssi_np; -- cgit v1.2.3-71-gd317 From b720342849fe685310fca01748a32730a6eca5aa Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 26 Mar 2018 14:09:26 -0700 Subject: sched/core: Update preempt_notifier_key to modern API No changes in refcount semantics; use DEFINE_STATIC_KEY_FALSE() for initialization and replace: static_key_slow_inc|dec() => static_branch_inc|dec() static_key_false() => static_branch_unlikely() Signed-off-by: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Link: http://lkml.kernel.org/r/20180326210929.5244-4-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b249adbf2a48..de440456f15c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2462,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) { - static_key_slow_inc(&preempt_notifier_key); + static_branch_inc(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) { - static_key_slow_dec(&preempt_notifier_key); + static_branch_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_dec); @@ -2482,7 +2482,7 @@
EXPORT_SYMBOL_GPL(preempt_notifier_dec); */ void preempt_notifier_register(struct preempt_notifier *notifier) { - if (!static_key_false(&preempt_notifier_key)) + if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while notifiers disabled\n"); hlist_add_head(&notifier->link, &current->preempt_notifiers); @@ -2511,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); } @@ -2529,7 +2529,7 @@ static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next); } -- cgit v1.2.3-71-gd317 From 13cf912b2da3d79377d8335492077b7225ce67f9 Mon Sep 17 00:00:00 2001 From: Rohit Visavalia Date: Mon, 29 Jan 2018 15:11:26 +0530 Subject: tracing: Block comments should align the * on each line Resolved the "Block comments use * on subsequent lines" checkpatch warning. Issue found by checkpatch. Signed-off-by: Rohit Visavalia Signed-off-by: Jiri Kosina --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20a2300ae4e8..a392bb00889d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2380,7 +2380,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() -*/ + */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, -- cgit v1.2.3-71-gd317 From 2f635ceeb22ba13c307236d69795fbb29cfa3e7c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:13 +0300 Subject: net: Drop pernet_operations::async Synchronous pernet_operations are not allowed anymore. All are asynchronous. So, drop the structure member. Signed-off-by: Kirill Tkhai Signed-off-by: David S.
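For readers who do not have the static-key API in their head, here is a minimal, self-contained sketch of the modern static-branch interface that the sched/core patch above migrates to. It is illustrative only: the key and functions (my_key, my_enable, my_disable, my_fast_path) are invented for this example and do not appear in the patch.

#include <linux/jump_label.h>
#include <linux/printk.h>

/* Key starts disabled; the branch below compiles to a patchable NOP. */
static DEFINE_STATIC_KEY_FALSE(my_key);

void my_enable(void)
{
	static_branch_inc(&my_key);	/* replaces static_key_slow_inc() */
}

void my_disable(void)
{
	static_branch_dec(&my_key);	/* replaces static_key_slow_dec() */
}

void my_fast_path(void)
{
	/* replaces static_key_false(); a patched jump, not a load-and-test */
	if (static_branch_unlikely(&my_key))
		pr_info("slow path enabled\n");
}

As the commit message notes, refcount semantics are unchanged: static_branch_inc()/static_branch_dec() nest, and the branch stays enabled until the count drops back to zero.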
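Similarly, for the pernet_operations change whose diffs follow: after this patch a pernet subsystem registers exactly as before, just without the .async flag. A minimal sketch, assuming a hypothetical foo subsystem (struct foo_net, foo_net_id, and the init/exit bodies are invented for illustration, not taken from the patch):

#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct foo_net {
	int counter;		/* illustrative per-netns state */
};

static unsigned int foo_net_id;

static int __net_init foo_net_init(struct net *net)
{
	struct foo_net *fn = net_generic(net, foo_net_id);

	fn->counter = 0;	/* per-netns area is zeroed on allocation; shown for clarity */
	return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
	/* release any per-netns resources here */
}

/* Note: no ".async = true" member anymore; all pernet_operations run async. */
static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.exit = foo_net_exit,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),
};

static int __init foo_init(void)
{
	return register_pernet_subsys(&foo_net_ops);
}

The .id/.size pair asks the netns core to allocate sizeof(struct foo_net) bytes per namespace, retrievable with net_generic(); that mechanism is untouched by this series.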
Miller --- drivers/infiniband/core/cma.c | 1 - drivers/net/bonding/bond_main.c | 1 - drivers/net/geneve.c | 1 - drivers/net/gtp.c | 1 - drivers/net/ipvlan/ipvlan_main.c | 1 - drivers/net/loopback.c | 1 - drivers/net/ppp/ppp_generic.c | 1 - drivers/net/ppp/pppoe.c | 1 - drivers/net/vrf.c | 1 - drivers/net/vxlan.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - fs/lockd/svc.c | 1 - fs/nfs/blocklayout/rpc_pipefs.c | 1 - fs/nfs/dns_resolve.c | 1 - fs/nfs/inode.c | 1 - fs/nfs_common/grace.c | 1 - fs/nfsd/nfsctl.c | 1 - fs/proc/proc_net.c | 1 - include/net/net_namespace.h | 6 ------ kernel/audit.c | 1 - lib/kobject_uevent.c | 1 - net/8021q/vlan.c | 1 - net/bridge/br.c | 1 - net/bridge/br_netfilter_hooks.c | 1 - net/bridge/netfilter/ebtable_broute.c | 1 - net/bridge/netfilter/ebtable_filter.c | 1 - net/bridge/netfilter/ebtable_nat.c | 1 - net/bridge/netfilter/nf_log_bridge.c | 1 - net/caif/caif_dev.c | 1 - net/can/af_can.c | 1 - net/can/bcm.c | 1 - net/can/gw.c | 1 - net/core/dev.c | 2 -- net/core/fib_notifier.c | 1 - net/core/fib_rules.c | 1 - net/core/net-procfs.c | 2 -- net/core/net_namespace.c | 2 -- net/core/pktgen.c | 1 - net/core/rtnetlink.c | 1 - net/core/sock.c | 2 -- net/core/sock_diag.c | 1 - net/core/sysctl_net_core.c | 1 - net/dccp/ipv4.c | 1 - net/dccp/ipv6.c | 1 - net/ieee802154/6lowpan/reassembly.c | 1 - net/ieee802154/core.c | 1 - net/ipv4/af_inet.c | 2 -- net/ipv4/arp.c | 1 - net/ipv4/devinet.c | 1 - net/ipv4/fib_frontend.c | 1 - net/ipv4/fou.c | 1 - net/ipv4/icmp.c | 1 - net/ipv4/igmp.c | 1 - net/ipv4/ip_fragment.c | 1 - net/ipv4/ip_gre.c | 3 --- net/ipv4/ip_vti.c | 1 - net/ipv4/ipip.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/arptable_filter.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 - net/ipv4/netfilter/iptable_filter.c | 1 - net/ipv4/netfilter/iptable_mangle.c | 1 - net/ipv4/netfilter/iptable_nat.c | 1 - net/ipv4/netfilter/iptable_raw.c | 1 - net/ipv4/netfilter/iptable_security.c | 1 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 - net/ipv4/netfilter/nf_defrag_ipv4.c | 1 - net/ipv4/netfilter/nf_log_arp.c | 1 - net/ipv4/netfilter/nf_log_ipv4.c | 1 - net/ipv4/ping.c | 1 - net/ipv4/proc.c | 1 - net/ipv4/raw.c | 1 - net/ipv4/route.c | 4 ---- net/ipv4/sysctl_net_ipv4.c | 1 - net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_metrics.c | 1 - net/ipv4/udp.c | 2 -- net/ipv4/udplite.c | 1 - net/ipv4/xfrm4_policy.c | 1 - net/ipv6/addrconf.c | 2 -- net/ipv6/addrlabel.c | 1 - net/ipv6/af_inet6.c | 1 - net/ipv6/fib6_rules.c | 1 - net/ipv6/icmp.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - net/ipv6/ip6_fib.c | 1 - net/ipv6/ip6_flowlabel.c | 1 - net/ipv6/ip6_gre.c | 1 - net/ipv6/ip6_tunnel.c | 1 - net/ipv6/ip6_vti.c | 1 - net/ipv6/ip6mr.c | 1 - net/ipv6/mcast.c | 1 - net/ipv6/ndisc.c | 1 - net/ipv6/netfilter/ip6_tables.c | 1 - net/ipv6/netfilter/ip6table_filter.c | 1 - net/ipv6/netfilter/ip6table_mangle.c | 1 - net/ipv6/netfilter/ip6table_nat.c | 1 - net/ipv6/netfilter/ip6table_raw.c | 1 - net/ipv6/netfilter/ip6table_security.c | 1 - net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 - net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 - net/ipv6/netfilter/nf_log_ipv6.c | 1 - net/ipv6/ping.c | 1 - net/ipv6/proc.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/reassembly.c | 1 - net/ipv6/route.c | 3 --- net/ipv6/seg6.c | 1 - net/ipv6/sit.c | 1 - net/ipv6/sysctl_net_ipv6.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/ipv6/udplite.c | 1 - net/ipv6/xfrm6_policy.c | 1 - net/ipv6/xfrm6_tunnel.c | 1 - 
net/kcm/kcmproc.c | 1 - net/kcm/kcmsock.c | 1 - net/key/af_key.c | 1 - net/l2tp/l2tp_core.c | 1 - net/l2tp/l2tp_ppp.c | 1 - net/mpls/af_mpls.c | 1 - net/netfilter/core.c | 1 - net/netfilter/ipset/ip_set_core.c | 1 - net/netfilter/ipvs/ip_vs_core.c | 2 -- net/netfilter/ipvs/ip_vs_ftp.c | 1 - net/netfilter/ipvs/ip_vs_lblc.c | 1 - net/netfilter/ipvs/ip_vs_lblcr.c | 1 - net/netfilter/nf_conntrack_netlink.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_standalone.c | 1 - net/netfilter/nf_log.c | 1 - net/netfilter/nf_log_netdev.c | 1 - net/netfilter/nf_synproxy_core.c | 1 - net/netfilter/nf_tables_api.c | 1 - net/netfilter/nfnetlink.c | 1 - net/netfilter/nfnetlink_acct.c | 1 - net/netfilter/nfnetlink_cttimeout.c | 1 - net/netfilter/nfnetlink_log.c | 1 - net/netfilter/nfnetlink_queue.c | 1 - net/netfilter/x_tables.c | 1 - net/netfilter/xt_hashlimit.c | 1 - net/netfilter/xt_recent.c | 1 - net/netlink/af_netlink.c | 2 -- net/netlink/genetlink.c | 1 - net/openvswitch/datapath.c | 1 - net/packet/af_packet.c | 1 - net/phonet/pn_dev.c | 1 - net/rds/tcp.c | 1 - net/rxrpc/net_ns.c | 1 - net/sched/act_api.c | 1 - net/sched/act_bpf.c | 1 - net/sched/act_connmark.c | 1 - net/sched/act_csum.c | 1 - net/sched/act_gact.c | 1 - net/sched/act_ife.c | 1 - net/sched/act_ipt.c | 2 -- net/sched/act_mirred.c | 1 - net/sched/act_nat.c | 1 - net/sched/act_pedit.c | 1 - net/sched/act_police.c | 1 - net/sched/act_sample.c | 1 - net/sched/act_simple.c | 1 - net/sched/act_skbedit.c | 1 - net/sched/act_skbmod.c | 1 - net/sched/act_tunnel_key.c | 1 - net/sched/act_vlan.c | 1 - net/sched/cls_api.c | 1 - net/sched/sch_api.c | 1 - net/sctp/protocol.c | 2 -- net/sunrpc/auth_gss/auth_gss.c | 1 - net/sunrpc/sunrpc_syms.c | 1 - net/sysctl_net.c | 1 - net/tipc/core.c | 1 - net/unix/af_unix.c | 1 - net/wireless/core.c | 1 - net/wireless/wext-core.c | 1 - net/xfrm/xfrm_policy.c | 1 - net/xfrm/xfrm_user.c | 1 - security/selinux/hooks.c | 1 - security/smack/smack_netfilter.c | 1 - 182 files changed, 206 deletions(-) (limited to 'kernel') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 66f203730e80..6ab1059fed66 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4554,7 +4554,6 @@ static struct pernet_operations cma_pernet_operations = { .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), - .async = true, }; static int __init cma_init(void) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4c19d23dd282..c669554d70bb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4791,7 +4791,6 @@ static struct pernet_operations bond_net_ops = { .exit = bond_net_exit, .id = &bond_net_id, .size = sizeof(struct bond_net), - .async = true, }; static int __init bonding_init(void) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 516dd59249d7..b919e89a9b93 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1694,7 +1694,6 @@ static struct pernet_operations geneve_net_ops = { .exit_batch = geneve_exit_batch_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), - .async = true, }; static int __init geneve_init_module(void) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 127edd23018f..f38e32a7ec9c 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1325,7 +1325,6 @@ static struct pernet_operations gtp_net_ops = { .exit = gtp_net_exit, .id = &gtp_net_id, .size = sizeof(struct gtp_net), - .async = true, }; static int __init
gtp_init(void) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 743d37fb034a..450eec264a5e 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -1040,7 +1040,6 @@ static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, - .async = true, }; static int __init ipvlan_init_module(void) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b97a907ea5aa..30612497643c 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -230,5 +230,4 @@ out: /* Registered in net/core/dev.c */ struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, - .async = true, }; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22fcff3c7a9a..dc7c7ec43202 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -970,7 +970,6 @@ static struct pernet_operations ppp_net_ops = { .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), - .async = true, }; static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index f9552a400271..1483bc7b01e1 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1161,7 +1161,6 @@ static struct pernet_operations pppoe_net_ops = { .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), - .async = true, }; static int __init pppoe_init(void) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c6be49d3a9eb..102582459bef 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1435,7 +1435,6 @@ static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .id = &vrf_net_id, .size = sizeof(bool), - .async = true, }; static int __init vrf_init_module(void) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aa5f034d6ad1..fab7a4db249e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3752,7 +3752,6 @@ static struct pernet_operations vxlan_net_ops = { .exit_batch = vxlan_exit_batch_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), - .async = true, }; static int __init vxlan_init_module(void) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 100cf42db65d..a37f4b1d9d30 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3542,7 +3542,6 @@ static struct pernet_operations hwsim_net_ops = { .exit = hwsim_exit_net, .id = &hwsim_net_id, .size = sizeof(struct hwsim_net), - .async = true, }; static void hwsim_exit_netlink(void) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2dee4e03ff1c..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -709,7 +709,6 @@ static struct pernet_operations lockd_net_ops = { .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), - .async = true, }; diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index ef9fa111b009..9fb067a6f7e0 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -261,7 +261,6 @@ static void nfs4blocklayout_net_exit(struct net *net) static struct pernet_operations nfs4blocklayout_net_ops = { .init = nfs4blocklayout_net_init, .exit = nfs4blocklayout_net_exit, - .async = true, }; int __init bl_init_pipefs(void) diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e90bd69ab653..060c658eab66 100644 --- 
a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -410,7 +410,6 @@ static void nfs4_dns_net_exit(struct net *net) static struct pernet_operations nfs4_dns_resolver_ops = { .init = nfs4_dns_net_init, .exit = nfs4_dns_net_exit, - .async = true, }; static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6c3083c992e5..7d893543cf3b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2122,7 +2122,6 @@ static struct pernet_operations nfs_net_ops = { .exit = nfs_net_exit, .id = &nfs_net_id, .size = sizeof(struct nfs_net), - .async = true, }; /* diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 8c743a405df6..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -118,7 +118,6 @@ static struct pernet_operations grace_net_ops = { .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), - .async = true, }; static int __init diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1e3824e6cce0..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1263,7 +1263,6 @@ static struct pernet_operations nfsd_net_ops = { .exit = nfsd_exit_net, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), - .async = true, }; static int __init init_nfsd(void) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index da6f8733c9c5..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -237,7 +237,6 @@ static __net_exit void proc_net_ns_exit(struct net *net) static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, - .async = true, }; int __init proc_net_init(void) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 09e30bdc7876..37bcf8382b61 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -333,12 +333,6 @@ struct pernet_operations { void (*exit_batch)(struct list_head *net_exit_list); unsigned int *id; size_t size; - /* - * Indicates above methods are allowed to be executed in parallel - * with methods of any other pernet_operations, i.e. they are not - * need write locked net_sem. - */ - bool async; }; /* diff --git a/kernel/audit.c b/kernel/audit.c index 5e49b614d0e6..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1526,7 +1526,6 @@ static struct pernet_operations audit_net_ops __net_initdata = { .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), - .async = true, }; /* Initialize audit support at boot time. 
*/ diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index fa10ad8e9b17..15ea216a67ce 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -724,7 +724,6 @@ static void uevent_net_exit(struct net *net) static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, - .async = true, }; static int __init kobject_uevent_init(void) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index bd0ed39f65fb..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -729,7 +729,6 @@ static struct pernet_operations vlan_net_ops = { .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), - .async = true, }; static int __init vlan_proto_init(void) diff --git a/net/bridge/br.c b/net/bridge/br.c index a3f95ab9d6a3..26e1616b2c90 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -188,7 +188,6 @@ static void __net_exit br_net_exit(struct net *net) static struct pernet_operations br_net_ops = { .exit = br_net_exit, - .async = true, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c2120eb889a9..9b16eaf33819 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -969,7 +969,6 @@ static struct pernet_operations brnf_net_ops __read_mostly = { .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), - .async = true, }; static struct notifier_block brnf_notifier __read_mostly = { diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index f070b5e5b9dd..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -77,7 +77,6 @@ static void __net_exit broute_net_exit(struct net *net) static struct pernet_operations broute_net_ops = { .init = broute_net_init, .exit = broute_net_exit, - .async = true, }; static int __init ebtable_broute_init(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 4151afc8efcc..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -105,7 +105,6 @@ static void __net_exit frame_filter_net_exit(struct net *net) static struct pernet_operations frame_filter_net_ops = { .init = frame_filter_net_init, .exit = frame_filter_net_exit, - .async = true, }; static int __init ebtable_filter_init(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index b8da2dfe2ec5..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -105,7 +105,6 @@ static void __net_exit frame_nat_net_exit(struct net *net) static struct pernet_operations frame_nat_net_ops = { .init = frame_nat_net_init, .exit = frame_nat_net_exit, - .async = true, }; static int __init ebtable_nat_init(void) diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index 91bfc2ac055a..bd2b3c78f59b 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -48,7 +48,6 @@ static void __net_exit nf_log_bridge_net_exit(struct net *net) static struct pernet_operations nf_log_bridge_net_ops = { .init = nf_log_bridge_net_init, .exit = nf_log_bridge_net_exit, - .async = true, }; static int __init nf_log_bridge_init(void) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 7a78268cc572..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -544,7 +544,6 @@ static 
struct pernet_operations caif_net_ops = { .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), - .async = true, }; /* Initialize Caif devices list */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 2f0d0a72e4b5..1684ba5b51eb 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -954,7 +954,6 @@ static struct notifier_block can_netdev_notifier __read_mostly = { static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, - .async = true, }; static __init int can_init(void) diff --git a/net/can/bcm.c b/net/can/bcm.c index 26730d39e048..ac5e5e34fee3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1717,7 +1717,6 @@ static void canbcm_pernet_exit(struct net *net) static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, - .async = true, }; static int __init bcm_module_init(void) diff --git a/net/can/gw.c b/net/can/gw.c index 8d71e199d5b3..faa3da88a127 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1010,7 +1010,6 @@ static void __net_exit cangw_pernet_exit(struct net *net) static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit = cangw_pernet_exit, - .async = true, }; static __init int cgw_module_init(void) diff --git a/net/core/dev.c b/net/core/dev.c index 97a96df4b6da..e13807b5c84d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8883,7 +8883,6 @@ static void __net_exit netdev_exit(struct net *net) static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, - .async = true, }; static void __net_exit default_device_exit(struct net *net) @@ -8984,7 +8983,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, .exit_batch = default_device_exit_batch, - .async = true, }; /* diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 5ace0705a3f9..0c048bdeb016 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -171,7 +171,6 @@ static void __net_exit fib_notifier_net_exit(struct net *net) static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, - .async = true, }; static int __init fib_notifier_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f6f04fc0f629..9d87ce868402 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1130,7 +1130,6 @@ static void __net_exit fib_rules_net_exit(struct net *net) static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, - .async = true, }; static int __init fib_rules_init(void) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index dd6ae431d038..9737302907b1 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -349,7 +349,6 @@ static void __net_exit dev_proc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_proc_ops = { .init = dev_proc_net_init, .exit = dev_proc_net_exit, - .async = true, }; static int dev_mc_seq_show(struct seq_file *seq, void *v) @@ -406,7 +405,6 @@ static void __net_exit dev_mc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_mc_net_ops = { .init = dev_mc_net_init, .exit = dev_mc_net_exit, - .async = true, }; int __init dev_proc_init(void) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0f614523a13f..eef17ad29dea 
100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -338,7 +338,6 @@ static int __net_init net_defaults_init_net(struct net *net) static struct pernet_operations net_defaults_ops = { .init = net_defaults_init_net, - .async = true, }; static __init int net_defaults_init(void) @@ -628,7 +627,6 @@ static __net_exit void net_ns_net_exit(struct net *net) static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, - .async = true, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 545cf08cd558..7e4ede34cc52 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3852,7 +3852,6 @@ static struct pernet_operations pg_net_ops = { .exit = pg_net_exit, .id = &pg_net_id, .size = sizeof(struct pktgen_net), - .async = true, }; static int __init pg_init(void) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 87079eaa871b..31438b63d4b4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4730,7 +4730,6 @@ static void __net_exit rtnetlink_net_exit(struct net *net) static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, - .async = true, }; void __init rtnetlink_init(void) diff --git a/net/core/sock.c b/net/core/sock.c index 8cee2920a47f..6444525f610c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3175,7 +3175,6 @@ static void __net_exit sock_inuse_exit_net(struct net *net) static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, - .async = true, }; static __init int net_inuse_init(void) @@ -3470,7 +3469,6 @@ static __net_exit void proto_exit_net(struct net *net) static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, - .async = true, }; static int __init proto_init(void) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a3392a8f9276..c37b5be7c5e4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -324,7 +324,6 @@ static void __net_exit diag_net_exit(struct net *net) static struct pernet_operations diag_net_ops = { .init = diag_net_init, .exit = diag_net_exit, - .async = true, }; static int __init sock_diag_init(void) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4f47f92459cc..b3b609f0eeb5 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -584,7 +584,6 @@ static __net_exit void sysctl_core_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, - .async = true, }; static __init int sysctl_core_init(void) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 13ad28ab1e79..e65fcb45c3f6 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1031,7 +1031,6 @@ static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, - .async = true, }; static int __init dccp_v4_init(void) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2f48c020f8c3..5df7857fc0f3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1116,7 +1116,6 @@ static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, - .async = true, }; static int __init dccp_v6_init(void) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 
a9ccb1322f69..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -603,7 +603,6 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .exit = lowpan_frags_exit_net, - .async = true, }; int __init lowpan_net_frag_init(void) diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 9104943c15ba..cb7176cd4cd6 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -345,7 +345,6 @@ static void __net_exit cfg802154_pernet_exit(struct net *net) static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, - .async = true, }; static int __init wpan_phy_class_init(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e8c7fad8c329..f98e2f0db841 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1735,7 +1735,6 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, - .async = true, }; static int __init init_ipv4_mibs(void) @@ -1789,7 +1788,6 @@ static __net_exit void inet_exit_net(struct net *net) static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, .exit = inet_exit_net, - .async = true, }; static int __init init_inet_pernet_ops(void) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4c6ba0fb2630..be4c595edccb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1447,7 +1447,6 @@ static void __net_exit arp_net_exit(struct net *net) static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, - .async = true, }; static int __init arp_proc_init(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5ae0d1f097ca..40f001782c1b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2469,7 +2469,6 @@ static __net_exit void devinet_exit_net(struct net *net) static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, - .async = true, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ac71c3d496c0..f05afaf3235c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1362,7 +1362,6 @@ static void __net_exit fib_net_exit(struct net *net) static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, - .async = true, }; void __init ip_fib_init(void) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d3e1a9af478b..1540db65241a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1081,7 +1081,6 @@ static struct pernet_operations fou_net_ops = { .exit = fou_exit_net, .id = &fou_net_id, .size = sizeof(struct fou_net), - .async = true, }; static int __init fou_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cc56efa64d5c..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1257,7 +1257,6 @@ fail: static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, .exit = icmp_sk_exit, - .async = true, }; int __init icmp_init(void) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f17cd83ba164..b26a81a7de42 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -3028,7 +3028,6 @@ static void __net_exit igmp_net_exit(struct net *net) static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, - .async = true, }; #endif diff --git 
a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 5e843ae5e468..bbf1b94942c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -885,7 +885,6 @@ static void __net_exit ipv4_frags_exit_net(struct net *net) static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .exit = ipv4_frags_exit_net, - .async = true, }; void __init ipfrag_init(void) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9ab1aa2f7660..a8772a978224 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1044,7 +1044,6 @@ static struct pernet_operations ipgre_net_ops = { .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1628,7 +1627,6 @@ static struct pernet_operations ipgre_tap_net_ops = { .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __net_init erspan_init_net(struct net *net) @@ -1647,7 +1645,6 @@ static struct pernet_operations erspan_net_ops = { .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __init ipgre_init(void) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b10bf563afd9..51b1669334fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -454,7 +454,6 @@ static struct pernet_operations vti_net_ops = { .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9c5a4d164f09..c891235b4966 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -669,7 +669,6 @@ static struct pernet_operations ipip_net_ops = { .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __init ipip_init(void) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e79211a8537c..2fb4de3f7f66 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3009,7 +3009,6 @@ static void __net_exit ipmr_net_exit(struct net *net) static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, - .async = true, }; int __init ip_mr_init(void) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c36ffce3c812..e3e420f3ba7b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1635,7 +1635,6 @@ static void __net_exit arp_tables_net_exit(struct net *net) static struct pernet_operations arp_tables_net_ops = { .init = arp_tables_net_init, .exit = arp_tables_net_exit, - .async = true, }; static int __init arp_tables_init(void) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 49c2490193ae..8f8713b4388f 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -65,7 +65,6 @@ static void __net_exit arptable_filter_net_exit(struct net *net) static struct pernet_operations arptable_filter_net_ops = { .exit = arptable_filter_net_exit, - .async = true, }; static int __init arptable_filter_init(void) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d4f7584d2dbe..e38395a8dcf2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1916,7 +1916,6 @@ static void __net_exit ip_tables_net_exit(struct net *net) static struct 
pernet_operations ip_tables_net_ops = { .init = ip_tables_net_init, .exit = ip_tables_net_exit, - .async = true, }; static int __init ip_tables_init(void) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 31b4cca588d0..2c8d313ae216 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -845,7 +845,6 @@ static struct pernet_operations clusterip_net_ops = { .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), - .async = true, }; static int __init clusterip_tg_init(void) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index c1c136a93911..9ac92ea7b93c 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -87,7 +87,6 @@ static void __net_exit iptable_filter_net_exit(struct net *net) static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .exit = iptable_filter_net_exit, - .async = true, }; static int __init iptable_filter_init(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index f6074059531a..dea138ca8925 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -113,7 +113,6 @@ static void __net_exit iptable_mangle_net_exit(struct net *net) static struct pernet_operations iptable_mangle_net_ops = { .exit = iptable_mangle_net_exit, - .async = true, }; static int __init iptable_mangle_init(void) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index b771af74be79..0f7255cc65ee 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,7 +129,6 @@ static void __net_exit iptable_nat_net_exit(struct net *net) static struct pernet_operations iptable_nat_net_ops = { .exit = iptable_nat_net_exit, - .async = true, }; static int __init iptable_nat_init(void) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 963753e50842..960625aabf04 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -76,7 +76,6 @@ static void __net_exit iptable_raw_net_exit(struct net *net) static struct pernet_operations iptable_raw_net_ops = { .exit = iptable_raw_net_exit, - .async = true, }; static int __init iptable_raw_init(void) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c40d6b3d8b6a..e5379fe57b64 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -76,7 +76,6 @@ static void __net_exit iptable_security_net_exit(struct net *net) static struct pernet_operations iptable_security_net_ops = { .exit = iptable_security_net_exit, - .async = true, }; static int __init iptable_security_init(void) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 6531f69db010..b50721d9d30e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -399,7 +399,6 @@ static struct pernet_operations ipv4_net_ops = { .exit = ipv4_net_exit, .id = &conntrack4_net_id, .size = sizeof(struct conntrack4_net), - .async = true, }; static int __init nf_conntrack_l3proto_ipv4_init(void) diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 57244b62a4fc..a0d3ad60a411 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -118,7 +118,6 
@@ static void __net_exit defrag4_net_exit(struct net *net) static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .async = true, }; static int __init nf_defrag_init(void) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 162293469ac2..df5c2a2061a4 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -122,7 +122,6 @@ static void __net_exit nf_log_arp_net_exit(struct net *net) static struct pernet_operations nf_log_arp_net_ops = { .init = nf_log_arp_net_init, .exit = nf_log_arp_net_exit, - .async = true, }; static int __init nf_log_arp_init(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 7a06de140f3c..4388de0e5380 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -358,7 +358,6 @@ static void __net_exit nf_log_ipv4_net_exit(struct net *net) static struct pernet_operations nf_log_ipv4_net_ops = { .init = nf_log_ipv4_net_init, .exit = nf_log_ipv4_net_exit, - .async = true, }; static int __init nf_log_ipv4_init(void) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 1f24bc8273a0..05e47d777009 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1204,7 +1204,6 @@ static void __net_exit ping_v4_proc_exit_net(struct net *net) static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, - .async = true, }; int __init ping_proc_init(void) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 80de2e659dcb..adfb75340275 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -549,7 +549,6 @@ static __net_exit void ip_proc_exit_net(struct net *net) static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, - .async = true, }; int __init ip_misc_proc_init(void) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0ee2501a9027..1b4d3355624a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1154,7 +1154,6 @@ static __net_exit void raw_exit_net(struct net *net) static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, - .async = true, }; int __init raw_proc_init(void) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce9bd5380d21..8322e479f299 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -418,7 +418,6 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, - .async = true, }; static int __init ip_rt_proc_init(void) @@ -3017,7 +3016,6 @@ static __net_exit void sysctl_route_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, - .async = true, }; #endif @@ -3031,7 +3029,6 @@ static __net_init int rt_genid_init(struct net *net) static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, - .async = true, }; static int __net_init ipv4_inetpeer_init(struct net *net) @@ -3057,7 +3054,6 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net) static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, - .async = true, }; #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5b72d97693f8..4b195bac8ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1219,7 +1219,6 @@ 
static __net_exit void ipv4_sysctl_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, - .async = true, }; static __init int sysctl_ipv4_init(void) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fec8b1fd7b63..9639334ebb7c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2391,7 +2391,6 @@ static void __net_exit tcp4_proc_exit_net(struct net *net) static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, - .async = true, }; int __init tcp4_proc_init(void) @@ -2578,7 +2577,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, - .async = true, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index aa6fea9f3328..03b51cdcc731 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1024,7 +1024,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, - .async = true, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fb8f3a36bd14..f49e14cd3891 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2756,7 +2756,6 @@ static void __net_exit udp4_proc_exit_net(struct net *net) static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, - .async = true, }; int __init udp4_proc_init(void) @@ -2843,7 +2842,6 @@ static int __net_init udp_sysctl_init(struct net *net) static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, - .async = true, }; void __init udp_init(void) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 72f2c3806408..f96614e9b9a5 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -104,7 +104,6 @@ static void __net_exit udplite4_proc_exit_net(struct net *net) static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, - .async = true, }; static __init int udplite4_proc_init(void) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6c76a757fa4a..d73a6d6652f6 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -367,7 +367,6 @@ static void __net_exit xfrm4_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, - .async = true, }; static void __init xfrm4_policy_init(void) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 189eac80f4ef..78cef00c9596 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4282,7 +4282,6 @@ static void __net_exit if6_proc_net_exit(struct net *net) static struct pernet_operations if6_proc_net_ops = { .init = if6_proc_net_init, .exit = if6_proc_net_exit, - .async = true, }; int __init if6_proc_init(void) @@ -6592,7 +6591,6 @@ static void __net_exit addrconf_exit_net(struct net *net) static struct pernet_operations addrconf_ops = { .init = addrconf_init_net, .exit = addrconf_exit_net, - .async = true, }; static struct rtnl_af_ops inet6_ops __read_mostly = { diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ba2e63633370..1d6ced37ad71 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -344,7 +344,6 @@ static void __net_exit 
ip6addrlbl_net_exit(struct net *net) static struct pernet_operations ipv6_addr_label_ops = { .init = ip6addrlbl_net_init, .exit = ip6addrlbl_net_exit, - .async = true, }; int __init ipv6_addr_label_init(void) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dbbe04018813..c1e292db04db 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,6 @@ static void __net_exit inet6_net_exit(struct net *net) static struct pernet_operations inet6_net_ops = { .init = inet6_net_init, .exit = inet6_net_exit, - .async = true, }; static const struct ipv6_stub ipv6_stub_impl = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 00ef9467f3c0..df113c7b5fc8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -397,7 +397,6 @@ static void __net_exit fib6_rules_net_exit(struct net *net) static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit = fib6_rules_net_exit, - .async = true, }; int __init fib6_rules_init(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6f84668be6ea..d8c4b6374377 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -998,7 +998,6 @@ static void __net_exit icmpv6_sk_exit(struct net *net) static struct pernet_operations icmpv6_sk_ops = { .init = icmpv6_sk_init, .exit = icmpv6_sk_exit, - .async = true, }; int __init icmpv6_init(void) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e438699f000f..44c39c5f0638 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -613,7 +613,6 @@ static struct pernet_operations ila_net_ops = { .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), - .async = true, }; static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2f995e9e3050..908b8e5b615a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2161,7 +2161,6 @@ static void fib6_net_exit(struct net *net) static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, - .async = true, }; int __init fib6_init(void) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index f75b06ba8325..c05c4e82a7ca 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -873,7 +873,6 @@ static void __net_exit ip6_flowlabel_net_exit(struct net *net) static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, - .async = true, }; int ip6_flowlabel_init(void) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3a98c694da5f..22e86557aca4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1528,7 +1528,6 @@ static struct pernet_operations ip6gre_net_ops = { .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), - .async = true, }; static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 456fcf942f95..df4c29f7d59f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2260,7 +2260,6 @@ static struct pernet_operations ip6_tnl_net_ops = { .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), - .async = true, }; /** diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index a482b854eeea..60b771f49fb5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1148,7 +1148,6 @@ static struct pernet_operations vti6_net_ops = { .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = 
sizeof(struct vti6_net), - .async = true, }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 1c8fa29d155a..298fd8b6ed17 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1348,7 +1348,6 @@ static void __net_exit ip6mr_net_exit(struct net *net) static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, - .async = true, }; int __init ip6_mr_init(void) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1e4c2b6ebd78..793159d77d8a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2997,7 +2997,6 @@ static void __net_exit igmp6_net_exit(struct net *net) static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, - .async = true, }; int __init igmp6_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d1d0b2fa7a07..9de4dfb126ba 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1883,7 +1883,6 @@ static void __net_exit ndisc_net_exit(struct net *net) static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, - .async = true, }; int __init ndisc_init(void) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4de8ac1e5af4..62358b93bbac 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1928,7 +1928,6 @@ static void __net_exit ip6_tables_net_exit(struct net *net) static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, - .async = true, }; static int __init ip6_tables_init(void) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 06561c84c0bc..1343077dde93 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -87,7 +87,6 @@ static void __net_exit ip6table_filter_net_exit(struct net *net) static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, .exit = ip6table_filter_net_exit, - .async = true, }; static int __init ip6table_filter_init(void) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a11e25936b45..b0524b18c4fb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -107,7 +107,6 @@ static void __net_exit ip6table_mangle_net_exit(struct net *net) static struct pernet_operations ip6table_mangle_net_ops = { .exit = ip6table_mangle_net_exit, - .async = true, }; static int __init ip6table_mangle_init(void) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 4475fd300bb6..47306e45a80a 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,7 +131,6 @@ static void __net_exit ip6table_nat_net_exit(struct net *net) static struct pernet_operations ip6table_nat_net_ops = { .exit = ip6table_nat_net_exit, - .async = true, }; static int __init ip6table_nat_init(void) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index a88f3b1995b1..710fa0806c37 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -75,7 +75,6 @@ static void __net_exit ip6table_raw_net_exit(struct net *net) static struct pernet_operations ip6table_raw_net_ops = { .exit = ip6table_raw_net_exit, - .async = true, }; static int __init ip6table_raw_init(void) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 
320048c008dc..cf26ccb04056 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -74,7 +74,6 @@ static void __net_exit ip6table_security_net_exit(struct net *net) static struct pernet_operations ip6table_security_net_ops = { .exit = ip6table_security_net_exit, - .async = true, }; static int __init ip6table_security_init(void) diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ba54bb3bd1e4..663827ee3cf8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -401,7 +401,6 @@ static struct pernet_operations ipv6_net_ops = { .exit = ipv6_net_exit, .id = &conntrack6_net_id, .size = sizeof(struct conntrack6_net), - .async = true, }; static int __init nf_conntrack_l3proto_ipv6_init(void) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 34136fe80ed5..b84ce3e6d728 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -646,7 +646,6 @@ static void nf_ct_net_exit(struct net *net) static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .exit = nf_ct_net_exit, - .async = true, }; int nf_ct_frag6_init(void) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 32f98bc06900..c87b48359e8f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -103,7 +103,6 @@ static void __net_exit defrag6_net_exit(struct net *net) static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, - .async = true, }; static int __init nf_defrag_init(void) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 0220e584589c..b397a8fe88b9 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -390,7 +390,6 @@ static void __net_exit nf_log_ipv6_net_exit(struct net *net) static struct pernet_operations nf_log_ipv6_net_ops = { .init = nf_log_ipv6_net_init, .exit = nf_log_ipv6_net_exit, - .async = true, }; static int __init nf_log_ipv6_init(void) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 318c6e914234..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -240,7 +240,6 @@ static void __net_init ping_v6_proc_exit_net(struct net *net) static struct pernet_operations ping_v6_net_ops = { .init = ping_v6_proc_init_net, .exit = ping_v6_proc_exit_net, - .async = true, }; #endif diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index f9891fa672f3..6e57028d2e91 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -343,7 +343,6 @@ static void __net_exit ipv6_proc_exit_net(struct net *net) static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, - .async = true, }; int __init ipv6_misc_proc_init(void) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b5e5de732494..5eb9b08947ed 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1332,7 +1332,6 @@ static void __net_exit raw6_exit_net(struct net *net) static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, - .async = true, }; int __init raw6_proc_init(void) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b5da69c83123..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -733,7 +733,6 @@ static void __net_exit ipv6_frags_exit_net(struct net *net) static struct pernet_operations 
ip6_frags_ops = { .init = ipv6_frags_init_net, .exit = ipv6_frags_exit_net, - .async = true, }; int __init ipv6_frag_init(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d0eaa69874d..ba8d5df50ebe 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5083,7 +5083,6 @@ static void __net_exit ip6_route_net_exit_late(struct net *net) static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, - .async = true, }; static int __net_init ipv6_inetpeer_init(struct net *net) @@ -5109,13 +5108,11 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net) static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, - .async = true, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, - .async = true, }; static struct notifier_block ip6_route_dev_notifier = { diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index c3f13c3bd8a9..7f5621d09571 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -395,7 +395,6 @@ static void __net_exit seg6_net_exit(struct net *net) static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, - .async = true, }; static const struct genl_ops seg6_genl_ops[] = { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8a4f8fddd812..1522bcfd253f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1888,7 +1888,6 @@ static struct pernet_operations sit_net_ops = { .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), - .async = true, }; static void __exit sit_cleanup(void) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 966c42af92f4..6fbdef630152 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -278,7 +278,6 @@ static void __net_exit ipv6_sysctl_net_exit(struct net *net) static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, - .async = true, }; static struct ctl_table_header *ip6_header; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5425d7b100ee..883df0ad5bfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2007,7 +2007,6 @@ static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, .exit_batch = tcpv6_net_exit_batch, - .async = true, }; int __init tcpv6_init(void) diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index f3839780dc31..14ae32bb1f3d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -123,7 +123,6 @@ static void __net_exit udplite6_proc_exit_net(struct net *net) static struct pernet_operations udplite6_net_ops = { .init = udplite6_proc_init_net, .exit = udplite6_proc_exit_net, - .async = true, }; int __init udplite6_proc_init(void) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cbb270bd81b0..416fe67271a9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -400,7 +400,6 @@ static void __net_exit xfrm6_net_exit(struct net *net) static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, - .async = true, }; int __init xfrm6_init(void) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index a9673619e0e9..f85f0d7480ac 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -353,7 +353,6 @@ static struct pernet_operations xfrm6_tunnel_net_ops = { .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct 
xfrm6_tunnel_net), - .async = true, }; static int __init xfrm6_tunnel_init(void) diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 4c2e9907f254..1fac92543094 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -433,7 +433,6 @@ static void kcm_proc_exit_net(struct net *net) static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, - .async = true, }; int __init kcm_proc_init(void) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 516cfad71b85..dc76bc346829 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -2028,7 +2028,6 @@ static struct pernet_operations kcm_net_ops = { .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), - .async = true, }; static int __init kcm_init(void) diff --git a/net/key/af_key.c b/net/key/af_key.c index 3ac08ab26207..7e2e7188e7f4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3863,7 +3863,6 @@ static struct pernet_operations pfkey_net_ops = { .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), - .async = true, }; static void __exit ipsec_pfkey_exit(void) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index b86868da50d4..14b67dfacc4b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1789,7 +1789,6 @@ static struct pernet_operations l2tp_net_ops = { .exit = l2tp_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), - .async = true, }; static int __init l2tp_init(void) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f24504efe729..d6deca11da19 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1762,7 +1762,6 @@ static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, .id = &pppol2tp_net_id, - .async = true, }; /***************************************************************************** diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d4a89a8be013..7a4de6d618b1 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2488,7 +2488,6 @@ static void mpls_net_exit(struct net *net) static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, - .async = true, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index d72cc786c7b7..0f6b8172fb9a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -629,7 +629,6 @@ static void __net_exit netfilter_net_exit(struct net *net) static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, - .async = true, }; int __init netfilter_init(void) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2523ebe2b3cc..bc4bd247bb7d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2095,7 +2095,6 @@ static struct pernet_operations ip_set_net_ops = { .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), - .async = true, }; static int __init diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 6a6cb9db030b..5f6f73cf2174 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2289,12 +2289,10 @@ static struct pernet_operations ipvs_core_ops = { .exit = __ip_vs_cleanup, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), - .async = true, }; static struct pernet_operations ipvs_core_dev_ops = { .exit = __ip_vs_dev_cleanup, - .async = true, }; /* diff --git 
a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 8b25aab41928..58d5d05aec24 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -479,7 +479,6 @@ static void __ip_vs_ftp_exit(struct net *net) static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, - .async = true, }; static int __init ip_vs_ftp_init(void) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 6a340c94c4b8..d625179de485 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -604,7 +604,6 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { } static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, - .async = true, }; static int __init ip_vs_lblc_init(void) diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0627881128da..84c57b62a588 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -789,7 +789,6 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, - .async = true, }; static int __init ip_vs_lblcr_init(void) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8884d302d33a..dd177ebee9aa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3417,7 +3417,6 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, .exit_batch = ctnetlink_net_exit_batch, - .async = true, }; static int __init ctnetlink_init(void) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9bcd72fe91f9..d049ea5a3770 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -406,7 +406,6 @@ static struct pernet_operations proto_gre_net_ops = { .exit = proto_gre_net_exit, .id = &proto_gre_net_id, .size = sizeof(struct netns_proto_gre), - .async = true, }; static int __init nf_ct_proto_gre_init(void) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 98844c87d01e..037fec54c850 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -705,7 +705,6 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, - .async = true, }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a964e4d356cc..6d0357817cda 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -577,7 +577,6 @@ static void __net_exit nf_log_net_exit(struct net *net) static struct pernet_operations nf_log_net_ops = { .init = nf_log_net_init, .exit = nf_log_net_exit, - .async = true, }; int __init netfilter_log_init(void) diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c index 254c2c6bde48..350eb147754d 100644 --- a/net/netfilter/nf_log_netdev.c +++ b/net/netfilter/nf_log_netdev.c @@ -47,7 +47,6 @@ static void __net_exit nf_log_netdev_net_exit(struct net *net) static struct pernet_operations nf_log_netdev_net_ops = { .init = nf_log_netdev_net_init, .exit = 
nf_log_netdev_net_exit, - .async = true, }; static int __init nf_log_netdev_init(void) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8f16fd27132d..6039b350abbe 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -398,7 +398,6 @@ static struct pernet_operations synproxy_net_ops = { .exit = synproxy_net_exit, .id = &synproxy_net_id, .size = sizeof(struct synproxy_net), - .async = true, }; static int __init synproxy_core_init(void) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd13d28e4ca7..c4acc7340eb1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6597,7 +6597,6 @@ static void __net_exit nf_tables_exit_net(struct net *net) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .exit = nf_tables_exit_net, - .async = true, }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 84fc4954862d..03ead8a9e90c 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -566,7 +566,6 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, - .async = true, }; static int __init nfnetlink_init(void) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 8d9f18bb8840..88d427f9f9e6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -515,7 +515,6 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, - .async = true, }; static int __init nfnl_acct_init(void) diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 6819300f7fb7..95b04702a655 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -586,7 +586,6 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .exit = cttimeout_net_exit, - .async = true, }; static int __init cttimeout_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b21ef79849a1..7b46aa4c478d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1108,7 +1108,6 @@ static struct pernet_operations nfnl_log_net_ops = { .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), - .async = true, }; static int __init nfnetlink_log_init(void) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 9f572ed56208..0b839c38800f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1525,7 +1525,6 @@ static struct pernet_operations nfnl_queue_net_ops = { .exit_batch = nfnl_queue_net_exit_batch, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), - .async = true, }; static int __init nfnetlink_queue_init(void) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 6de1f6a4cb80..4aa01c90e9d1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1789,7 +1789,6 @@ static void __net_exit xt_net_exit(struct net *net) static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, - .async = true, }; static int __init xt_init(void) diff --git 
a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ef65b7a9173e..3360f13dc208 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1349,7 +1349,6 @@ static struct pernet_operations hashlimit_net_ops = { .exit = hashlimit_net_exit, .id = &hashlimit_net_id, .size = sizeof(struct hashlimit_net), - .async = true, }; static int __init hashlimit_mt_init(void) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 434e35ce940b..9bbfc17ce3ec 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -687,7 +687,6 @@ static struct pernet_operations recent_net_ops = { .exit = recent_net_exit, .id = &recent_net_id, .size = sizeof(struct recent_net), - .async = true, }; static struct xt_match recent_mt_reg[] __read_mostly = { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5d10dcfe6411..f1b02d87e336 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,7 +253,6 @@ static struct pernet_operations netlink_tap_net_ops = { .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), - .async = true, }; static bool netlink_filter_tap(const struct sk_buff *skb) @@ -2726,7 +2725,6 @@ static void __init netlink_add_usersock_entry(void) static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, - .async = true, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index af51b8c0a2cb..b9ce82c9440f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1035,7 +1035,6 @@ static void __net_exit genl_pernet_exit(struct net *net) static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, - .async = true, }; static int __init genl_init(void) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 100191df0371..ef38e5aecd28 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2384,7 +2384,6 @@ static struct pernet_operations ovs_net_ops = { .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), - .async = true, }; static int __init dp_init(void) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2c5a6fe5d749..616cb9c18f88 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4557,7 +4557,6 @@ static void __net_exit packet_net_exit(struct net *net) static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, - .async = true, }; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 9454e8393793..77787512fc32 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -342,7 +342,6 @@ static struct pernet_operations phonet_net_ops = { .exit = phonet_exit_net, .id = &phonet_net_id, .size = sizeof(struct phonet_net), - .async = true, }; /* Initialize Phonet devices list */ diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 4f3a32c38bf5..351a28474667 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -530,7 +530,6 @@ static struct pernet_operations rds_tcp_net_ops = { .exit = rds_tcp_exit_net, .id = &rds_tcp_netid, .size = sizeof(struct rds_tcp_net), - .async = true, }; void *rds_tcp_listen_sock_def_readable(struct net *net) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 5fd939dabf41..f18c9248e0d4 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -106,5 +106,4 @@ struct pernet_operations rxrpc_net_ops = { .exit = 
rxrpc_exit_net, .id = &rxrpc_net_id, .size = sizeof(struct rxrpc_net), - .async = true, }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 7bd1b964f021..0d78b58e1898 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1533,7 +1533,6 @@ static struct pernet_operations tcf_action_net_ops = { .exit = tcf_action_net_exit, .id = &tcf_action_net_id, .size = sizeof(struct tcf_action_net), - .async = true, }; static int __init tc_action_init(void) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5cb9b268e8ff..9092531d45d8 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -413,7 +413,6 @@ static struct pernet_operations bpf_net_ops = { .exit_batch = bpf_exit_net, .id = &bpf_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init bpf_init_module(void) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 371e5e4ab3e2..e4b880fa51fe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -222,7 +222,6 @@ static struct pernet_operations connmark_net_ops = { .exit_batch = connmark_exit_net, .id = &connmark_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init connmark_init_module(void) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a527e287c086..7e28b2ce1437 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -678,7 +678,6 @@ static struct pernet_operations csum_net_ops = { .exit_batch = csum_exit_net, .id = &csum_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_DESCRIPTION("Checksum updating actions"); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 88fbb8403565..4dc4f153cad8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -261,7 +261,6 @@ static struct pernet_operations gact_net_ops = { .exit_batch = gact_exit_net, .id = &gact_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 555b1caeff72..a5994cf0512b 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -870,7 +870,6 @@ static struct pernet_operations ife_net_ops = { .exit_batch = ife_exit_net, .id = &ife_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init ife_init_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index b5e8565b89c7..14c312d7908f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -352,7 +352,6 @@ static struct pernet_operations ipt_net_ops = { .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int tcf_xt_walker(struct net *net, struct sk_buff *skb, @@ -403,7 +402,6 @@ static struct pernet_operations xt_net_ops = { .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 64c86579c3d9..fd34015331ab 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -353,7 +353,6 @@ static struct pernet_operations mirred_net_ops = { .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b1bc757f6491..4b5848b6c252 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -323,7 +323,6 @@ static struct pernet_operations nat_net_ops = { .exit_batch = 
nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_DESCRIPTION("Stateless NAT actions"); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f392ccaaa0d8..8a925c72db5f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -465,7 +465,6 @@ static struct pernet_operations pedit_net_ops = { .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 7081ec75e696..4e72bc2a0dfb 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -347,7 +347,6 @@ static struct pernet_operations police_net_ops = { .exit_batch = police_exit_net, .id = &police_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init police_init_module(void) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3a89f98f17e6..5db358497c9e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -249,7 +249,6 @@ static struct pernet_operations sample_net_ops = { .exit_batch = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init sample_init_module(void) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e84768ae610a..9618b4a83cee 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -216,7 +216,6 @@ static struct pernet_operations simp_net_ops = { .exit_batch = simp_exit_net, .id = &simp_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7971510fe61b..ddf69fc01bdf 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -253,7 +253,6 @@ static struct pernet_operations skbedit_net_ops = { .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Alexander Duyck, "); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 142a996ac776..bbcbdce732cc 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -279,7 +279,6 @@ static struct pernet_operations skbmod_net_ops = { .exit_batch = skbmod_exit_net, .id = &skbmod_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim, "); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index a1c8dd406a04..626dac81a48a 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -339,7 +339,6 @@ static struct pernet_operations tunnel_key_net_ops = { .exit_batch = tunnel_key_exit_net, .id = &tunnel_key_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init tunnel_key_init_module(void) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 41a66effeb5f..853604685965 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -314,7 +314,6 @@ static struct pernet_operations vlan_net_ops = { .exit_batch = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init vlan_init_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ec5fe8ec0c3e..b66754f52a9f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1619,7 +1619,6 @@ static struct pernet_operations tcf_net_ops = { .exit = tcf_net_exit, .id = &tcf_net_id, .size = sizeof(struct tcf_net), - .async = true, }; static int __init tc_filter_init(void) 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 68f9d942bed4..106dae7e4818 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2133,7 +2133,6 @@ static void __net_exit psched_net_exit(struct net *net) static struct pernet_operations psched_net_ops = { .init = psched_net_init, .exit = psched_net_exit, - .async = true, }; static int __init pktsched_init(void) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 493b817f6a2a..84a09f599131 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1283,7 +1283,6 @@ static void __net_exit sctp_defaults_exit(struct net *net) static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, - .async = true, }; static int __net_init sctp_ctrlsock_init(struct net *net) @@ -1307,7 +1306,6 @@ static void __net_init sctp_ctrlsock_exit(struct net *net) static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, - .async = true, }; /* Initialize the universe into something sensible. */ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 44f939cb6bc8..9463af4b32e8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2063,7 +2063,6 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net) static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, - .async = true, }; /* diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 68287e921847..56f9eff74150 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -79,7 +79,6 @@ static struct pernet_operations sunrpc_net_ops = { .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), - .async = true, }; static int __init diff --git a/net/sysctl_net.c b/net/sysctl_net.c index f424539829b7..9aed6fe1bf1a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -89,7 +89,6 @@ static void __net_exit sysctl_net_exit(struct net *net) static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, - .async = true, }; static struct ctl_table_header *net_header; diff --git a/net/tipc/core.c b/net/tipc/core.c index 52dfc51ac4d5..5b38f5164281 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,7 +109,6 @@ static struct pernet_operations tipc_net_ops = { .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), - .async = true, }; static int __init tipc_init(void) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bc2970a8e7f3..aded82da1aea 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2913,7 +2913,6 @@ static void __net_exit unix_net_exit(struct net *net) static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, - .async = true, }; static int __init af_unix_init(void) diff --git a/net/wireless/core.c b/net/wireless/core.c index 670aa229168a..a6f3cac8c640 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1340,7 +1340,6 @@ static void __net_exit cfg80211_pernet_exit(struct net *net) static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, - .async = true, }; static int __init cfg80211_init(void) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index bc7064486b15..9efbfc753347 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -390,7 +390,6 @@ static void __net_exit wext_pernet_exit(struct net *net) static struct 
pernet_operations wext_pernet_ops = { .init = wext_pernet_init, .exit = wext_pernet_exit, - .async = true, }; static int __init wireless_nlevent_init(void) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cb3bb9ae4407..625b3fca5704 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2985,7 +2985,6 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, - .async = true, }; void __init xfrm_init(void) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e92b8c019c88..080035f056d9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3253,7 +3253,6 @@ static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) static struct pernet_operations xfrm_user_net_ops = { .init = xfrm_user_net_init, .exit_batch = xfrm_user_net_exit, - .async = true, }; static int __init xfrm_user_init(void) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b4d7b6242a40..8644d864e3c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6743,7 +6743,6 @@ static void __net_exit selinux_nf_unregister(struct net *net) static struct pernet_operations selinux_net_ops = { .init = selinux_nf_register, .exit = selinux_nf_unregister, - .async = true, }; static int __init selinux_nf_ip_init(void) diff --git a/security/smack/smack_netfilter.c b/security/smack/smack_netfilter.c index 3f29c03162ca..e36d17835d4f 100644 --- a/security/smack/smack_netfilter.c +++ b/security/smack/smack_netfilter.c @@ -89,7 +89,6 @@ static void __net_exit smack_nf_unregister(struct net *net) static struct pernet_operations smack_net_ops = { .init = smack_nf_register, .exit = smack_nf_unregister, - .async = true, }; static int __init smack_nf_ip_init(void) -- cgit v1.2.3-71-gd317 From b76354cdfefb68bc017065850a34d486887a4f7b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Mar 2018 11:53:21 -0700 Subject: bpf: follow idr code convention Generally we do a preload before doing idr allocation. This also helps improve the allocation success rate under memory pressure. Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Shaohua Li Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd172ee16716..77d45bd9f507 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -203,11 +203,13 @@ static int bpf_map_alloc_id(struct bpf_map *map) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); + idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; @@ -940,11 +942,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); + idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) -- cgit v1.2.3-71-gd317 From f67b15037a7a50c57f72e69a6d59941ad90a0f0f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 26 Mar 2018 15:39:07 -1000 Subject: perf/hwbp: Simplify the perf-hwbp code, fix documentation Annoyingly, modify_user_hw_breakpoint() unnecessarily complicates the modification of a breakpoint - simplify it and remove the pointless local variables. 
Also update the stale Docbook while at it. Signed-off-by: Linus Torvalds Acked-by: Thomas Gleixner Cc: Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..253ae2da13c3 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -427,16 +427,9 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -451,27 +444,18 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; bp->attr.bp_len = attr->bp_len; + bp->attr.disabled = 1; - if (attr->disabled) - goto end; - - err = validate_hw_breakpoint(bp); - if (!err) - perf_event_enable(bp); + if (!attr->disabled) { + int err = validate_hw_breakpoint(bp); - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); + if (err) + return err; - return err; + perf_event_enable(bp); + bp->attr.disabled = 0; } -end: - bp->attr.disabled = attr->disabled; - return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3-71-gd317 From c4f6699dfcb8558d138fe838f741b2c10f416cf9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:37 -0700 Subject: bpf: introduce BPF_RAW_TRACEPOINT Introduce BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access kernel internal arguments of the tracepoints in their raw form. From the bpf program's point of view, access to the arguments looks like: struct bpf_raw_tracepoint_args { __u64 args[0]; }; int bpf_prog(struct bpf_raw_tracepoint_args *ctx) { // program can read args[N] where N depends on tracepoint // and statically verified at program load+attach time } The kprobe+bpf infrastructure allows programs to access function arguments. This feature allows programs to access raw tracepoint arguments. Similar to the proposed 'dynamic ftrace events', there are no ABI guarantees as to what the tracepoint arguments are and what their meaning is. The program needs to type cast args properly and use the bpf_probe_read() helper to access struct fields when an argument is a pointer. For every tracepoint, a __bpf_trace_##call function is prepared. 
In assembler it looks like: (gdb) disassemble __bpf_trace_xdp_exception Dump of assembler code for function __bpf_trace_xdp_exception: 0xffffffff81132080 <+0>: mov %ecx,%ecx 0xffffffff81132082 <+2>: jmpq 0xffffffff811231f0 where TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), The above assembler snippet is casting the 32-bit 'act' field into 'u64' to pass into bpf_trace_run3(), while 'dev' and 'xdp' args are passed as-is. All ~500 __bpf_trace_*() functions are only 5-10 bytes long and in total this approach adds 7k bytes to .text. This approach gives the lowest possible overhead while calling trace_xdp_exception() from kernel C code and transitioning into bpf land. Since tracepoint+bpf are used at speeds of 1M+ events per second, this is a valuable optimization. The new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced that returns an anon_inode FD of a 'bpf-raw-tracepoint' object. The user space looks like: // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type prog_fd = bpf_prog_load(...); // receive anon_inode fd for given bpf_raw_tracepoint with prog attached raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd); Ctrl-C of the tracing daemon or cmdline tool that uses this feature will automatically detach the bpf program, unload it and unregister the tracepoint probe. On the kernel side, the __bpf_raw_tp_map section of pointers to the tracepoint definition and the __bpf_trace_*() probe function is used to find the tracepoint named "xdp_exception" and its corresponding __bpf_trace_xdp_exception() probe function, which are passed to tracepoint_probe_register() to connect the probe with the tracepoint. The addition of bpf_raw_tracepoint doesn't interfere with the ftrace and perf tracepoint mechanisms. perf_event_open() can be used in parallel on the same tracepoint. Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) are permitted, each with its own bpf program. The kernel will execute all tracepoint probes and all attached bpf programs. In the future, bpf_raw_tracepoints can be extended with query/introspection logic. 
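As an illustrative sketch only (assuming just the uapi introduced by this patch; error handling and the BPF_PROG_LOAD step are omitted, and raw_tracepoint_open here is a local helper, not a kernel or libbpf API), the new command reduces to a thin wrapper around the bpf(2) syscall:

  #include <linux/bpf.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* prog_fd is assumed to come from a prior BPF_PROG_LOAD of a
   * BPF_PROG_TYPE_RAW_TRACEPOINT program. The returned anon_inode fd
   * keeps the probe registered; close()ing it unregisters the
   * tracepoint probe and drops the program reference.
   */
  static int raw_tracepoint_open(const char *name, int prog_fd)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.raw_tracepoint.name = (__u64)(unsigned long)name;
          attr.raw_tracepoint.prog_fd = prog_fd;

          return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
  }

e.g. raw_tracepoint_open("xdp_exception", prog_fd) yields the raw_tp_fd used in the flow above. 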
__bpf_raw_tp_map section logic was contributed by Steven Rostedt Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) Acked-by: Steven Rostedt (VMware) Signed-off-by: Daniel Borkmann --- include/asm-generic/vmlinux.lds.h | 10 +++ include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 42 +++++++++ include/linux/tracepoint-defs.h | 6 ++ include/trace/bpf_probe.h | 92 +++++++++++++++++++ include/trace/define_trace.h | 1 + include/uapi/linux/bpf.h | 11 +++ kernel/bpf/syscall.c | 78 ++++++++++++++++ kernel/trace/bpf_trace.c | 183 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 424 insertions(+) create mode 100644 include/trace/bpf_probe.h (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 1ab0e520d6fc..8add3493a202 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -178,6 +178,15 @@ #define TRACE_SYSCALLS() #endif +#ifdef CONFIG_BPF_EVENTS +#define BPF_RAW_TP() STRUCT_ALIGN(); \ + VMLINUX_SYMBOL(__start__bpf_raw_tp) = .; \ + KEEP(*(__bpf_raw_tp_map)) \ + VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .; +#else +#define BPF_RAW_TP() +#endif + #ifdef CONFIG_SERIAL_EARLYCON #define EARLYCON_TABLE() STRUCT_ALIGN(); \ VMLINUX_SYMBOL(__earlycon_table) = .; \ @@ -249,6 +258,7 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ + BPF_RAW_TP() \ TRACEPOINT_STR() /* diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5e2e8a49fb21..6d7243bfb0ff 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a1442c4e513..b0357cd198b0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + return NULL; +} #endif enum { @@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); +void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); +void 
bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3); +void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4); +void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5); +void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6); +void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); +void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8); +void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9); +void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10); +void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11); +void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 64ed7064f1fa..22c5a46e9693 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,4 +35,10 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct bpf_raw_event_map { + struct tracepoint *tp; + void *bpf_func; + u32 num_args; +} __aligned(32); + #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h new file mode 100644 index 000000000000..505dae0bed80 --- /dev/null +++ b/include/trace/bpf_probe.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM_VAR + +#ifdef CONFIG_BPF_EVENTS + +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_dynamic_array_len +#define __get_dynamic_array_len(field) \ + ((__entry->__data_loc_##field >> 16) & 0xffff) + +#undef __get_str +#define __get_str(field) ((char *)__get_dynamic_array(field)) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __perf_count +#define __perf_count(c) (c) + +#undef __perf_task +#define __perf_task(t) (t) + +/* cast any integer, pointer, or small struct to u64 */ +#define UINTTYPE(size) \ + __typeof__(__builtin_choose_expr(size == 1, (u8)1, \ + __builtin_choose_expr(size == 2, (u16)2, \ + __builtin_choose_expr(size == 4, (u32)3, \ + __builtin_choose_expr(size == 8, (u64)4, \ + (void)5))))) +#define __CAST_TO_U64(x) ({ \ + typeof(x) __src = (x); \ + UINTTYPE(sizeof(x)) __dst; \ + memcpy(&__dst, &__src, sizeof(__dst)); \ + (u64)__dst; }) + +#define __CAST1(a,...) __CAST_TO_U64(a) +#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__) +#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__) +#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__) +#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__) +#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__) +#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__) +#define __CAST8(a,...) 
__CAST_TO_U64(a), __CAST7(__VA_ARGS__) +#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__) +#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__) +#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__) +#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__) +/* tracepoints with more than 12 arguments will hit build error */ +#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static notrace void \ +__bpf_trace_##call(void *__data, proto) \ +{ \ + struct bpf_prog *prog = __data; \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ +} + +/* + * This part is compiled out, it is only here as a build time check + * to make sure that if the tracepoint handling changes, the + * bpf probe will fail to compile unless it too is updated. + */ +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ +static inline void bpf_test_probe_##call(void) \ +{ \ + check_trace_callback_type_##call(__bpf_trace_##template); \ +} \ +static struct bpf_raw_event_map __used \ + __attribute__((section("__bpf_raw_tp_map"))) \ +__bpf_trace_tp_map_##call = { \ + .tp = &__tracepoint_##call, \ + .bpf_func = (void *)__bpf_trace_##template, \ + .num_args = COUNT_ARGS(args), \ +}; + + +#undef DEFINE_EVENT_PRINT +#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ + DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_BPF_EVENTS */ diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index d9e3d4aa3f6e..cb30c5532144 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -95,6 +95,7 @@ #ifdef TRACEPOINTS_ENABLED #include #include +#include #endif #undef TRACE_EVENT diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18b7c510c511..1878201c2d77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, }; enum bpf_attach_type { @@ -344,6 +346,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -1152,4 +1159,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 77d45bd9f507..95ca2523fa6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,6 +1315,81 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +#define 
BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags @@ -1925,6 +2000,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7f9691c86b6e..463e72d18c4c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -735,6 +735,86 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) } } +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + 
return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + /* largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -908,3 +988,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return ret; } + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = __bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} -- cgit v1.2.3-71-gd317 From c28d62cf52d791ba5f6db7ce525ed06b86291c82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Mar 2018 14:14:38 +0200 Subject: locking/rtmutex: Handle non enqueued waiters gracefully in remove_waiter() In -RT task_blocks_on_rt_mutex() may return with -EAGAIN due to (->pi_blocked_on == PI_WAKEUP_INPROGRESS) before it added itself as a waiter. In such a case remove_waiter() must not be called because without a waiter it will trigger the BUG_ON() statement. This was initially reported by Yimin Deng. Thomas Gleixner fixed it then with an explicit check for waiters before calling remove_waiter(). Instead of an explicit NULL check before calling rt_mutex_top_waiter() make the function return NULL if there are no waiters. With that fixed the now pointless NULL check is removed from rt_mutex_slowlock(). 
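For clarity, a before/after sketch of the rt_mutex_slowlock() error path (illustrative only; it mirrors the hunk below and adds no new semantics):

  /* before: the accessor blows up on an empty waiter tree, so the
   * caller had to guard it */
  if (rt_mutex_has_waiters(lock))
          remove_waiter(lock, &waiter);

  /* after: rt_mutex_top_waiter() returns NULL for an empty tree, so
   * the unconditional call is safe */
  remove_waiter(lock, &waiter);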
Reported-and-debugged-by: Yimin Deng Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/CAAh1qt=DCL9aUXNxanP5BKtiPp3m+qj4yB+gDohhXPVFCxWwzg@mail.gmail.com Link: https://lkml.kernel.org/r/20180327121438.sss7hxg3crqy4ecd@linutronix.de --- kernel/locking/rtmutex.c | 3 +-- kernel/locking/rtmutex_common.h | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 940633c63254..4f014be7a4b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - if (rt_mutex_has_waiters(lock)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3ec3c1..d1d62f942be2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { - struct rt_mutex_waiter *w; - - w = rb_entry(lock->waiters.rb_leftmost, - struct rt_mutex_waiter, tree_entry); - BUG_ON(w->lock != lock); + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + struct rt_mutex_waiter *w = NULL; + if (leftmost) { + w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + BUG_ON(w->lock != lock); + } return w; } -- cgit v1.2.3-71-gd317 From b3c39758c8a6972f02b43f83dba7fe7a352371b9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 27 Mar 2018 19:41:41 +0900 Subject: lockdep: Make the lock debug output more useful The lock debug output in print_lock() has a few shortcomings: - It prints the hlock->acquire_ip field in %px and %pS format. That's redundant information. - It lacks information about the lock object itself. The lock class is not helpful to identify a particular instance of a lock. Change the output so it prints: - hlock->instance to allow identification of a particular lock instance. - only the %pS format of hlock->acquire_ip which is sufficient to decode the actual code line with faddr2line. 
The resulting output is: 3 locks held by a.out/31106: #0: 00000000b0f753ba (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0 #1: 00000000ef64d539 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0 #2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: copy_process.part.41+0x12f2/0x1fe0 [ tglx: Massaged changelog ] Signed-off-by: Tetsuo Handa Signed-off-by: Thomas Gleixner Acked-by: Michal Hocko Acked-by: David Rientjes Acked-by: Peter Zijlstra Cc: linux-mm@kvack.org Cc: Borislav Petkov Link: https://lkml.kernel.org/r/201803271941.GBE57310.tVSOJLQOFFOHFM@I-love.SAKURA.ne.jp --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 12a2805dd64a..023386338269 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) return; } + printk(KERN_CONT "%p", hlock->instance); print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%px>] %pS\n", - (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); + printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) -- cgit v1.2.3-71-gd317 From 6ed70cf342de03c7b11cd4eb032705faeb29d284 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 29 Mar 2018 15:06:48 +0300 Subject: perf/x86/pt, coresight: Clean up address filter structure This is a cosmetic patch that deals with the address filter structure's ambiguous fields 'filter' and 'range'. The former stands to mean that the filter's *action* should be to filter the traces to its address range if it's set or stop tracing if it's unset. This is confusing and hard on the eyes, so this patch replaces it with 'action' enum. The 'range' field is completely redundant (meaning that the filter is an address range as opposed to a single address trigger), as we can use zero size to mean the same thing. Signed-off-by: Alexander Shishkin Acked-by: Mathieu Poirier Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Will Deacon Link: http://lkml.kernel.org/r/20180329120648.11902-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 13 ++++-- drivers/hwtracing/coresight/coresight-etm-perf.c | 59 +++++++++++------------- include/linux/perf_event.h | 14 ++++-- kernel/events/core.c | 26 +++++++---- 4 files changed, 63 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 81fd41d5a0d9..3b993942a0e4 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1186,8 +1186,12 @@ static int pt_event_addr_filters_validate(struct list_head *filters) int range = 0; list_for_each_entry(filter, filters, entry) { - /* PT doesn't support single address triggers */ - if (!filter->range || !filter->size) + /* + * PT doesn't support single address triggers and + * 'start' filters. + */ + if (!filter->size || + filter->action == PERF_ADDR_FILTER_ACTION_START) return -EOPNOTSUPP; if (!filter->inode) { @@ -1227,7 +1231,10 @@ static void pt_event_addr_filters_sync(struct perf_event *event) filters->filter[range].msr_a = msr_a; filters->filter[range].msr_b = msr_b; - filters->filter[range].config = filter->filter ? 
1 : 2; + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER) + filters->filter[range].config = 1; + else + filters->filter[range].config = 2; range++; } diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 8a0ad77574e7..4e5ed6597f2f 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -393,35 +393,26 @@ static int etm_addr_filters_validate(struct list_head *filters) if (++index > ETM_ADDR_CMP_MAX) return -EOPNOTSUPP; + /* filter::size==0 means single address trigger */ + if (filter->size) { + /* + * The existing code relies on START/STOP filters + * being address filters. + */ + if (filter->action == PERF_ADDR_FILTER_ACTION_START || + filter->action == PERF_ADDR_FILTER_ACTION_STOP) + return -EOPNOTSUPP; + + range = true; + } else + address = true; + /* - * As taken from the struct perf_addr_filter documentation: - * @range: 1: range, 0: address - * * At this time we don't allow range and start/stop filtering * to cohabitate, they have to be mutually exclusive. */ - if ((filter->range == 1) && address) + if (range && address) return -EOPNOTSUPP; - - if ((filter->range == 0) && range) - return -EOPNOTSUPP; - - /* - * For range filtering, the second address in the address - * range comparator needs to be higher than the first. - * Invalid otherwise. - */ - if (filter->range && filter->size == 0) - return -EINVAL; - - /* - * Everything checks out with this filter, record what we've - * received before moving on to the next one. - */ - if (filter->range) - range = true; - else - address = true; } return 0; @@ -441,18 +432,20 @@ static void etm_addr_filters_sync(struct perf_event *event) stop = start + filter->size; etm_filter = &filters->etm_filter[i]; - if (filter->range == 1) { + switch (filter->action) { + case PERF_ADDR_FILTER_ACTION_FILTER: etm_filter->start_addr = start; etm_filter->stop_addr = stop; etm_filter->type = ETM_ADDR_TYPE_RANGE; - } else { - if (filter->filter == 1) { - etm_filter->start_addr = start; - etm_filter->type = ETM_ADDR_TYPE_START; - } else { - etm_filter->stop_addr = stop; - etm_filter->type = ETM_ADDR_TYPE_STOP; - } + break; + case PERF_ADDR_FILTER_ACTION_START: + etm_filter->start_addr = start; + etm_filter->type = ETM_ADDR_TYPE_START; + break; + case PERF_ADDR_FILTER_ACTION_STOP: + etm_filter->stop_addr = stop; + etm_filter->type = ETM_ADDR_TYPE_STOP; + break; } i++; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ff39ab011376..e71e99eb9a4e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -449,14 +449,19 @@ struct pmu { int (*filter_match) (struct perf_event *event); /* optional */ }; +enum perf_addr_filter_action_t { + PERF_ADDR_FILTER_ACTION_STOP = 0, + PERF_ADDR_FILTER_ACTION_START, + PERF_ADDR_FILTER_ACTION_FILTER, +}; + /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @inode: object file's inode for file-based filters * @offset: filter range offset - * @size: filter range size - * @range: 1: range, 0: address - * @filter: 1: filter/start, 0: stop + * @size: filter range size (size==0 means single address trigger) + * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. 
*/ @@ -465,8 +470,7 @@ struct perf_addr_filter { struct inode *inode; unsigned long offset; unsigned long size; - unsigned int range : 1, - filter : 1; + enum perf_addr_filter_action_t action; }; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 7517b4fb3ef4..fc1c330c6bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8803,7 +8803,8 @@ restart: * * for kernel addresses: [/] * * for object files: [/]@ * - * if is not specified, the range is treated as a single address. + * if is not specified or is zero, the range is treated as a single + * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, @@ -8853,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { + static const enum perf_addr_filter_action_t actions[] = { + [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, + [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, + [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, + }; ret = -EINVAL; if (!*start) @@ -8869,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, switch (token) { case IF_ACT_FILTER: case IF_ACT_START: - filter->filter = 1; - case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; + filter->action = actions[token]; state = IF_STATE_SOURCE; break; @@ -8887,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_SOURCE) goto fail; - if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) - filter->range = 1; - *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; - if (filter->range) { + if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) @@ -8903,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { - int fpos = filter->range ? 2 : 1; + int fpos = token == IF_SRC_FILE ? 2 : 1; filename = match_strdup(&args[fpos]); if (!filename) { @@ -8929,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (kernel && event->attr.exclude_kernel) goto fail; + /* + * ACTION "filter" must have a non-zero length region + * specified. + */ + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && + !filter->size) + goto fail; + if (!kernel) { if (!filename) goto fail; -- cgit v1.2.3-71-gd317 From bd03143007eb9b03a7f2316c677780561b68ba2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 26 Mar 2018 15:29:57 +0200 Subject: alarmtimer: Init nanosleep alarm timer on stack syzbot reported the following debugobjects splat: ODEBUG: object is on stack, but not annotated WARNING: CPU: 0 PID: 4185 at lib/debugobjects.c:328 RIP: 0010:debug_object_is_on_stack lib/debugobjects.c:327 [inline] debug_object_init+0x17/0x20 lib/debugobjects.c:391 debug_hrtimer_init kernel/time/hrtimer.c:410 [inline] debug_init kernel/time/hrtimer.c:458 [inline] hrtimer_init+0x8c/0x410 kernel/time/hrtimer.c:1259 alarm_init kernel/time/alarmtimer.c:339 [inline] alarm_timer_nsleep+0x164/0x4d0 kernel/time/alarmtimer.c:787 SYSC_clock_nanosleep kernel/time/posix-timers.c:1226 [inline] SyS_clock_nanosleep+0x235/0x330 kernel/time/posix-timers.c:1204 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 This happens because the hrtimer for the alarm nanosleep is on the stack, but the code does not use the proper debug objects initialization.
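The rule being applied, as a minimal sketch assuming only the stock hrtimer API (this is not the alarmtimer code itself): timers that live on the stack must be set up with the _on_stack() variants so debugobjects can annotate them, and must be torn down before the stack frame is left.

	/* Sketch of the on-stack timer pattern; arming and sleeping elided. */
	static void stack_timer_example(void)
	{
		struct hrtimer timer;

		hrtimer_init_on_stack(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
		/* ... hrtimer_start(), schedule(), hrtimer_cancel() ... */
		destroy_hrtimer_on_stack(&timer); /* pairs with the _on_stack init */
	}
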
Split out the code for the allocated use cases and invoke hrtimer_init_on_stack() for the nanosleep related functions. Reported-by: syzbot+a3e0726462b2e346a31d@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: syzkaller-bugs@googlegroups.com Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1803261528270.1585@nanos.tec.linutronix.de --- kernel/time/alarmtimer.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ec09ce9a6012..639321bf2e39 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -326,6 +326,17 @@ static int alarmtimer_resume(struct device *dev) } #endif +static void +__alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} + /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -335,13 +346,9 @@ static int alarmtimer_resume(struct device *dev) void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { - timerqueue_init(&alarm->node); hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); - alarm->timer.function = alarmtimer_fired; - alarm->function = function; - alarm->type = type; - alarm->state = ALARMTIMER_STATE_INACTIVE; + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); } EXPORT_SYMBOL_GPL(alarm_init); @@ -719,6 +726,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&alarm->timer); + if (!alarm->data) return 0; @@ -740,6 +749,15 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return -ERESTART_RESTARTBLOCK; } +static void +alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} + /** * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep * @restart: ptr to restart block @@ -752,7 +770,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); return alarmtimer_do_nsleep(&alarm, exp, type); } @@ -784,7 +802,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (!capable(CAP_WAKE_ALARM)) return -EPERM; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ -- cgit v1.2.3-71-gd317 From 8934ce2fd08171e8605f7fada91ee7619fe17ab8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:15 -0700 Subject: bpf: sockmap redirect ingress support Add support for the BPF_F_INGRESS flag in sk_msg redirect helper. To do this add a scatterlist ring for receiving socks to check before calling into regular recvmsg call path. 
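For orientation, a verdict program exercising this flag might look like the following sketch; the map name, key, include paths and bpf_map_def layout are illustrative (libbpf-era conventions), not part of this patch.

	/* Hypothetical sk_msg program redirecting to a socket's ingress queue. */
	#include <linux/bpf.h>
	#include "bpf_helpers.h"	/* SEC(), bpf_map_def; naming varies by tree */

	struct bpf_map_def SEC("maps") sock_map = {
		.type        = BPF_MAP_TYPE_SOCKMAP,
		.key_size    = sizeof(int),
		.value_size  = sizeof(int),
		.max_entries = 2,
	};

	SEC("sk_msg")
	int msg_ingress(struct sk_msg_md *msg)
	{
		/* queue the data on the receive side of the socket at key 0 */
		return bpf_msg_redirect_map(msg, &sock_map, 0, BPF_F_INGRESS);
	}
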
Additionally, because the poll wakeup logic only checked the skb recv queue we need to add a hook in TCP stack (similar to write side) so that we have a way to wake up polling socks when a scatterlist is redirected to that sock. After this all that is needed is for the redirect helper to push the scatterlist into the psock receive queue. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock.h | 1 + kernel/bpf/sockmap.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 10 ++- 5 files changed, 207 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index c2f167db8bd5..961cc5d53956 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct list_head list; }; /* Compute the linear packet data range [data, data_end) which diff --git a/include/net/sock.h b/include/net/sock.h index 709311132d4c..b8ff435fa96e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1085,6 +1085,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 69c5bccabd22..402e15466e9f 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -82,6 +84,7 @@ struct smap_psock { int sg_size; int eval; struct sk_msg_buff *cork; + struct list_head ingress; struct strparser strp; struct bpf_prog *bpf_tx_msg; @@ -103,6 +106,8 @@ struct smap_psock { }; static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); @@ -112,6 +117,21 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + static struct proto tcp_bpf_proto; static int bpf_tcp_init(struct sock *sk) { @@ -135,6 +155,8 @@ static int bpf_tcp_init(struct sock *sk) if (psock->bpf_tx_msg) { tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; } sk->sk_prot = &tcp_bpf_proto; @@ -170,6 +192,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -188,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + 
list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { @@ -468,6 +497,72 @@ verdict: return _rc; } +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? + apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) @@ -475,6 +570,7 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct smap_psock *psock; struct scatterlist *sg; int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); sg = md->sg_data; @@ -487,9 +583,14 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, goto out_rcu; rcu_read_unlock(); - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } smap_release_sock(psock, sk); if (unlikely(err)) goto out; @@ -623,6 +724,89 @@ out_err: return err; } +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + sk_mem_uncharge(sk, 
copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; @@ -1107,6 +1291,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -1131,6 +1316,12 @@ static void smap_gc_work(struct work_struct *w) kfree(psock->cork); } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); @@ -1160,6 +1351,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); + INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); diff --git a/net/core/filter.c b/net/core/filter.c index afd825534ac4..a5a995e5b380 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1894,7 +1894,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->key = key; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c31be306572..bccc4c270087 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. * @@ -554,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -- cgit v1.2.3-71-gd317 From fa246693a111fab32bd51d20f07a347e42773ee9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:25 -0700 Subject: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT: Add support for the BPF_F_INGRESS flag in skb redirect helper. To do this convert skb into a scatterlist and push into ingress queue. This is the same logic that is used in the sk_msg redirect helper so it should feel familiar. 
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + kernel/bpf/sockmap.c | 94 ++++++++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 2 +- 3 files changed, 78 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 961cc5d53956..897ff3d95968 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct sk_buff *skb; struct list_head list; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 402e15466e9f..9192fdbcbccc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -785,7 +785,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, i++; if (i == MAX_SKB_FRAGS) i = 0; - put_page(page); + if (!md->skb) + put_page(page); } if (copied == len) break; @@ -794,6 +795,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); + if (md->skb) + consume_skb(md->skb); kfree(md); } } @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 
0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); - - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + if (!sk) { + kfree_skb(skb); + break; + } + + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -1127,15 +1175,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -1153,7 +1209,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); diff --git a/net/core/filter.c b/net/core/filter.c index a5a995e5b380..e989bf313195 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1855,7 +1855,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.key = key; -- cgit v1.2.3-71-gd317 From 355064675f1c997cea017ea64c8f2c216e5425d9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 28 Mar 2018 12:01:09 -0500 Subject: PM / hibernate: Make passing hibernate offsets more friendly Currently the only way to specify a hibernate offset for a swap file is on the kernel command line. Add a new /sys/power/resume_offset that lets userspace specify the offset and disk to use when initiating a hibernate cycle. Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 14 ++++++++++++++ Documentation/power/swsusp.txt | 10 +++++++++- kernel/power/hibernate.c | 24 ++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 1e0d1dac706b..2f813d644c69 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -287,3 +287,17 @@ Description: Writing a "1" to this file enables the debug messages and writing a "0" (default) to it disables them. 
Reads from this file return the current value. + +What: /sys/power/resume_offset +Date: April 2018 +Contact: Mario Limonciello +Description: + This file is used for telling the kernel an offset into a disk + to use when hibernating the system such as with a swap file. + + Reads from this file will display the current offset + the kernel will be using on the next hibernation + attempt. + + Using this sysfs file will override any values that were + set using the kernel command line for disk offset. \ No newline at end of file diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index 9f2f942a01cf..cc87adf44c0a 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -24,8 +24,16 @@ Some warnings, first. * see the FAQ below for details. (This is not true for more traditional * power states like "standby", which normally don't turn USB off.) +Swap partition: You need to append resume=/dev/your_swap_partition to kernel command -line. Then you suspend by +line or specify it using /sys/power/resume. + +Swap file: +If using a swapfile you can also specify a resume offset using +resume_offset= on the kernel command line or specify it +in /sys/power/resume_offset. + +After preparing then you suspend by echo shutdown > /sys/power/disk; echo disk > /sys/power/state diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..d58fad3db22d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1061,6 +1061,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(resume); +static ssize_t resume_offset_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); +} + +static ssize_t resume_offset_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t n) +{ + unsigned long long offset; + int rc; + + rc = kstrtoull(buf, 0, &offset); + if (rc) + return rc; + swsusp_resume_block = offset; + + return n; +} + +power_attr(resume_offset); + static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1106,6 +1129,7 @@ power_attr(reserved_size); static struct attribute * g[] = { &disk_attr.attr, + &resume_offset_attr.attr, &resume_attr.attr, &image_size_attr.attr, &reserved_size_attr.attr, -- cgit v1.2.3-71-gd317 From 648464076160ee7a4112d05eea13621790ab9d04 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 28 Mar 2018 12:01:10 -0500 Subject: PM / hibernate: Change message when writing to /sys/power/resume This file is used both for setting the wakeup device without kernel command line as well as for actually waking the system (when appropriate swap header is in place). To avoid confusion on incorrect logs in system log downgrade the message to debug and make it clearer. Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index d58fad3db22d..1028ecbb7a06 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1053,7 +1053,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - pr_info("Starting manual resume from disk\n"); + pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device); noresume = 0; software_resume(); return n; -- cgit v1.2.3-71-gd317 From 6ef6d84ceee2262a8c7a4247616c7eb71268b3f9 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 30 Mar 2018 09:21:00 +0900 Subject: bpf: sockmap: initialize sg table entries properly When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized in sg_init_table() and it is verified by the sg API while navigating. We hit a BUG_ON when the magic check fails. In the functions bpf_tcp_sendpage and bpf_tcp_sendmsg, the struct containing the scatterlist is already zeroed out, so to avoid an extra memset we use sg_init_marker() to initialize sg_magic. Fixed the following things: - In bpf_tcp_sendpage: initialize sg using sg_init_marker - In bpf_tcp_sendmsg: Replace sg_init_table with sg_init_marker - In bpf_tcp_push: Replace memset with sg_init_table where a consumed sg entry needs to be re-initialized. Signed-off-by: Prashant Bhole Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 9192fdbcbccc..d2bda5aa25d7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -341,7 +341,7 @@ retry: md->sg_start++; if (md->sg_start == MAX_SKB_FRAGS) md->sg_start = 0; - memset(sg, 0, sizeof(*sg)); + sg_init_table(sg, 1); if (md->sg_start == md->sg_end) break; @@ -843,7 +843,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } sg = md.sg_data; - sg_init_table(sg, MAX_SKB_FRAGS); + sg_init_marker(sg, MAX_SKB_FRAGS); rcu_read_unlock(); lock_sock(sk); @@ -950,10 +950,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, lock_sock(sk); - if (psock->cork_bytes) + if (psock->cork_bytes) { m = psock->cork; - else + sg = &m->sg_data[m->sg_end]; + } else { m = &md; + sg = m->sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + } /* Catch case where ring is full and sendpage is stalled. */ if (unlikely(m->sg_end == m->sg_start && @@ -961,7 +965,6 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, goto out_err; psock->sg_size += size; - sg = &m->sg_data[m->sg_end]; sg_set_page(sg, page, size, offset); get_page(page); m->sg_copy[m->sg_end] = true; -- cgit v1.2.3-71-gd317 From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:00 -0700 Subject: bpf: Check attach type at prog load time == The problem == There are use cases where a program of some type can be attached to multiple attach points, and those attach points must have different permissions to access the context or to call helpers. E.g. a context structure may have fields for both IPv4 and IPv6, but it doesn't make sense to read from / write to the IPv6 field when the attach point is somewhere in the IPv4 stack. The same applies to BPF helpers: it may make sense to call some helper from one attach point, but not from another for the same prog type.
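To make the hazard concrete, a hypothetical sketch (the context struct and its fields are invented for exposition, not taken from this patch):

	/* One prog type shared by an IPv4 and an IPv6 attach point.
	 * Without knowing the attach type at load time, the verifier
	 * has to accept accesses to both fields for both hooks, even
	 * though user_ip6 is meaningless at the IPv4 attach point.
	 */
	struct hypothetical_ctx {
		__u32 user_ip4;		/* valid at the IPv4 hook only */
		__u32 user_ip6[4];	/* valid at the IPv6 hook only */
	};
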
== The solution == Introduce an `expected_attach_type` field in `struct bpf_attr` for the `BPF_PROG_LOAD` command. If the scenario described in "The problem" section is the case for some prog type, the field will be checked twice: 1) At load time the prog type is checked to see if the attach type for it must be known to validate program permissions correctly. The prog will be rejected with EINVAL if that's the case and `expected_attach_type` is not specified or has an invalid value. 2) At attach time `attach_type` is compared with `expected_attach_type`, if the prog type requires one, and, if they differ, the attach will be rejected with EINVAL. The `expected_attach_type` is now available as part of `struct bpf_prog` in both `bpf_verifier_ops->is_valid_access()` and `bpf_verifier_ops->get_func_proto()` and can be used to check context accesses and calls to helpers correspondingly. Initially the idea was discussed by Alexei Starovoitov and Daniel Borkmann here: https://marc.info/?l=linux-netdev&m=152107378717201&w=2 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 ++++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 3 ++- kernel/bpf/syscall.c | 31 ++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 6 +++--- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++--------- net/core/filter.c | 39 +++++++++++++++++++++++++-------------- 8 files changed, 88 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 819229c80eca..95a7abd0ee92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -208,12 +208,15 @@ struct bpf_prog_ops { struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + const struct bpf_func_proto * + (*get_func_proto)(enum bpf_func_id func_id, + const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 897ff3d95968..13c044e4832d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,6 +469,7 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1878201c2d77..102718624d1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -296,6 +296,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc).
+ */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c1c0b60d3f2f..8730b24ed540 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -545,7 +545,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,6 +566,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 95ca2523fa6e..9d3b572d4dec 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,8 +1171,27 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1209,11 +1228,16 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1474,6 +1498,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acd2207e412..10024323031d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1323,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2349,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -5572,7 
+5572,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 463e72d18c4c..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -731,7 +737,7 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } @@ -781,7 +787,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case 
BPF_FUNC_perf_event_output: @@ -789,12 +796,13 @@ static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { /* largest tracepoint in the kernel has 12 args */ @@ -816,6 +824,7 @@ const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); diff --git a/net/core/filter.c b/net/core/filter.c index e989bf313195..7790fd128614 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3685,7 +3685,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3699,7 +3699,7 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3714,7 +3714,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3781,7 +3781,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3804,7 +3804,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3831,7 +3831,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3847,7 +3847,8 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: @@ -3863,7 +3864,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3888,7 +3890,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { 
case BPF_FUNC_skb_get_tunnel_key: @@ -3918,11 +3920,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3966,6 +3969,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3986,11 +3990,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4020,11 +4025,12 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4096,6 +4102,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4125,7 +4132,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4142,6 +4149,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4174,6 +4182,7 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4220,6 +4229,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4249,11 +4259,12 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) -- cgit v1.2.3-71-gd317 From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:02 -0700 Subject: bpf: Hooks for sys_bind == The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. 
Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since that's often not possible. Currently this is solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever a TCP/UDP server calls `bind(2)`, the library replaces the IP in the sockaddr before passing the arguments to the syscall. When an application calls `connect(2)` the library transparently binds the local end of the connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid a performance penalty). The shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with the option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides a much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers to the desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds a new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to the already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and the user-provided `struct sockaddr`. Pointers to both of them are part of the context passed to programs of the newly added types. The new attach types provide hooks in the `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override the IP addresses and ports a user program tries to bind to, and apply such a program to a whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for the corresponding socket family. E.g. if a user passes a `sockaddr_in` it doesn't make sense to read from / write to the `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using a special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with the value to write and `dst` with the pointer to the context, which can't be changed so as not to break later instructions. But the fields allowed to be written to are not available directly; to access them the address of the corresponding pointer has to be loaded first. To get an additional register, the first one not used by `src` and `dst` is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load the address of the pointer field, and finally the register's content is restored from the temporary field after the `src` value has been written.
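A sketch of a program for the new hook (hedged: the section name follows later libbpf conventions, the include paths vary by tree, and the forced address is just an example; the context semantics match the bpf_sock_addr comments in the uapi diff below):

	#include <linux/bpf.h>
	#include "bpf_helpers.h"	/* SEC(); naming varies by tree */
	#include "bpf_endian.h"	/* bpf_htonl() */

	SEC("cgroup/bind4")
	int bind_v4(struct bpf_sock_addr *ctx)
	{
		/* Rewrite every bind() in the cgroup to 127.0.0.1 (example);
		 * user_ip4 is stored in network byte order and allows a
		 * 4-byte write.
		 */
		ctx->user_ip4 = bpf_htonl(0x7f000001);
		return 1;	/* 1 == allow the bind; anything else => -EPERM */
	}
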
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 21 ++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/cgroup.c | 36 +++++++ kernel/bpf/syscall.c | 36 +++++-- kernel/bpf/verifier.c | 1 + net/core/filter.c | 232 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 7 ++ net/ipv6/af_inet6.c | 7 ++ 10 files changed, 366 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8a4566691c8f..67dc4a6471ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include struct sock; +struct sockaddr; struct cgroup; struct sk_buff; struct bpf_sock_ops_kern; @@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type); + int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); @@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6d7243bfb0ff..2b28fcf6f6ae 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) diff --git a/include/linux/filter.h b/include/linux/filter.h index 13c044e4832d..fc4e8f91b03d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_addr_kern { + struct sock *sk; + struct sockaddr *uaddr; + /* Temporary "register" to make indirect stores to nested structures + * defined above. 
We need three registers to make such a store, but + * only two (src and dst) are available at convert_ctx_access time + */ + u64 tmp_reg; +}; + struct bpf_sock_ops_kern { struct sock *sk; u32 op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 102718624d1e..ce3e69e3c793 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,6 +136,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -147,6 +148,8 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -1010,6 +1013,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8730b24ed540..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -494,6 +494,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); +/** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be executed + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9d3b572d4dec..2cad66a4cacb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1175,19 +1175,29 @@ static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time.
- */ - return 0; + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } } /* last field in 'union bpf_attr' used by this command */ @@ -1479,6 +1489,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1541,6 +1555,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1590,6 +1608,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10024323031d..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3887,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index 7790fd128614..c08e5b121558 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3698,6 +3698,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4180,6 +4194,69 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 context and vice versa.
+ */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now. */ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -4724,6 +4801,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0)

+#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0)

+/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can override neither SRC, since it contains the value to store, + * nor DST, since it contains the pointer to context that may be used by later + * instructions. But we need a temporary place to save the pointer to the nested + * structure whose field we want to store to.
+ */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. 
+ */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5181,6 +5404,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e8c7fad8c329..2dec266507dc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + goto out; + if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dbbe04018813..fa24e3f06ac6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,6 +295,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; -- cgit v1.2.3-71-gd317 From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:05 -0700 Subject: bpf: Hooks for sys_connect

== The problem ==

See the description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides a much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connections from a desired IP.
It adds the new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time.

The local end of the connection can be bound to a desired IP using the newly introduced BPF helper `bpf_bind()`. It only allows binding to an IP though, and doesn't support binding to a port, i.e. it leverages the `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:

* looking for a free port is expensive and can affect performance significantly;
* there is no use-case for binding to a port.

As for the remote end (the `struct sockaddr *` passed by user), both parts of it can be overridden: remote IP and remote port. It's useful if an application inside a cgroup wants to connect to another application inside the same cgroup or to itself, but knows nothing about the IP assigned to the cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for the same reason as the sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when the user passes sockaddr_in, since that would be out-of-bounds.

== Implementation notes ==

The patch introduces a new field in `struct proto`: `pre_connect`, a pointer to a function with the same signature as `connect` that is called before it. The reason is that in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically, `inet_dgram_connect` autobinds the socket before calling `sk->sk_prot->connect`, and there is no way to call `bpf_bind()` from hooks in e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since that would cause a double bind. On the other hand, `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for the protocols that need them, and to call them at the desired time before `connect`. Since `bpf_bind()` is allowed to bind only to an IP and the autobind in `inet_dgram_connect` binds only the port, there is no chance of a double bind.

bpf_bind() sets `force_bind_address_no_port` to bind only to an IP regardless of the value of the `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling __inet_bind() and __inet6_bind(), since all call sites where bpf_bind() is called already hold the socket lock.
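For illustration, a connect4 program could look roughly like the sketch below; as with the bind4 example earlier, the `SEC()` name and the `bpf_bind()` declaration are selftests-style assumptions, and the source IP is made up.

	SEC("cgroup/connect4")
	int connect_v4_prog(struct bpf_sock_addr *ctx)
	{
		struct sockaddr_in sa = {};

		/* Pin the local end of the connection to a cgroup-specific
		 * source IP. sin_port stays 0, so the kernel picks a port at
		 * connect time (IP_BIND_ADDRESS_NO_PORT semantics).
		 */
		sa.sin_family = AF_INET;
		sa.sin_addr.s_addr = bpf_htonl(0x7f000004);	/* example source IP */

		if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
			return 0;	/* make connect(2) fail with EPERM */

		/* The destination in ctx->user_ip4/ctx->user_port could be
		 * rewritten here as well.
		 */
		return 1;
	}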
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++++++ include/net/addrconf.h | 7 ++++++ include/net/sock.h | 3 +++ include/net/udp.h | 1 + include/uapi/linux/bpf.h | 12 +++++++++- kernel/bpf/syscall.c | 8 +++++++ net/core/filter.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 13 +++++++++++ net/ipv4/tcp_ipv4.c | 16 +++++++++++++ net/ipv4/udp.c | 14 ++++++++++++ net/ipv6/af_inet6.c | 5 ++++ net/ipv6/tcp_ipv6.c | 16 +++++++++++++ net/ipv6/udp.c | 20 ++++++++++++++++ 13 files changed, 202 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 67dc4a6471ad..c6ab295e6dcb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + release_sock(sk); \ + } \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ + sk->sk_prot->pre_connect) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -151,11 +177,16 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 132e5b95167a..378d601258be 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -231,6 +231,13 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; +/* A stub used by bpf helpers. 
Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/sock.h b/include/net/sock.h index b8ff435fa96e..49bd2c1796b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1026,6 +1026,9 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); + int (*pre_connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/udp.h b/include/net/udp.h index 850a8e581cce..0676b272f6ac 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,6 +273,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ce3e69e3c793..77afaf1ba556 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -150,6 +150,8 @@ enum bpf_attach_type { BPF_SK_MSG_VERDICT, BPF_CGROUP_INET4_BIND, BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, __MAX_BPF_ATTACH_TYPE }; @@ -744,6 +746,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. 
+ * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -809,7 +818,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2cad66a4cacb..cf1b29bc0ab8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1180,6 +1180,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: return 0; default: return -EINVAL; @@ -1491,6 +1493,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1557,6 +1561,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1610,6 +1616,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index c08e5b121558..bdb9cadd4d27 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -3656,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. 
+ */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3707,6 +3754,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -4213,6 +4268,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: break; default: return false; @@ -4221,6 +4277,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: break; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e203a39d6988..488fe26ac8e5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -547,12 +547,19 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -633,6 +640,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c6aec2643e8..3c11d992d784 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. 
+ */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2409,6 +2424,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 908fc02fb4f8..9c6c77fec963 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1658,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2530,6 +2543,7 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 13110bee5c14..566cec0e0a44 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -887,6 +887,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1043,6 +1047,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5425d7b100ee..6469b741cf5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. 
+ */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1925,6 +1940,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ad30f5e31969..6861ed479469 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1512,6 +1531,7 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, -- cgit v1.2.3-71-gd317 From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:07 -0700 Subject: bpf: Post-hooks for sys_bind

"Post-hooks" are hooks that are called right before returning from sys_bind. At this time the IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind, but a BPF program has a chance to inspect the socket and change the sys_bind result. Specifically it can e.g. inspect what port was allocated, and if it doesn't satisfy some policy, the BPF program can force sys_bind to fail and return EPERM to the user.

Another example of usage is recording the IP:port pair to some map to use it in later calls to sys_connect. E.g. if some TCP server inside a cgroup was bound to some IP:port_n, it can be recorded to a map. And later, when some TCP client inside the same cgroup is trying to connect to 127.0.0.1:port_n, a BPF hook for sys_connect can override the destination and connect the application to IP:port_n instead of 127.0.0.1:port_n. That helps force all applications inside a cgroup to use the desired IP without breaking those applications if they e.g. use localhost to communicate with each other.

== Implementation details ==

Post-hooks are implemented as two new attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for the existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`. Separate attach types for IPv4 and IPv6 are introduced to avoid access to the IPv6 fields in `struct sock` from `inet_bind()` and to the IPv4 fields from `inet6_bind()`, since those fields might not make sense in such cases.
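As a sketch of how a post-bind program might be used (the section name and the port policy below are illustrative assumptions, not part of this patch):

	SEC("cgroup/post_bind4")
	int post_bind_v4_prog(struct bpf_sock *ctx)
	{
		/* src_port is in host byte order, per the field comments in
		 * struct bpf_sock below. Reject privileged ports, so sys_bind
		 * fails with EPERM even though the port was allocated.
		 */
		if (ctx->src_port < 1024)
			return 0;

		return 1;
	}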
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 16 +++++-- include/uapi/linux/bpf.h | 11 +++++ kernel/bpf/syscall.c | 43 +++++++++++++++++ net/core/filter.c | 116 +++++++++++++++++++++++++++++++++++++++------ net/ipv4/af_inet.c | 18 ++++--- net/ipv6/af_inet6.c | 21 +++++--- 6 files changed, 195 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c6ab295e6dcb..30d15e64b993 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77afaf1ba556..c5ec89732a8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_BIND, BPF_CGROUP_INET4_CONNECT, BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -948,6 +950,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf1b29bc0ab8..0244973ee544 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,11 +1171,46 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. 
+ * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. + */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: @@ -1195,6 +1230,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; default: @@ -1240,6 +1276,7 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) return -EINVAL; @@ -1489,6 +1526,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1557,6 +1596,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1616,6 +1657,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: diff --git a/net/core/filter.c b/net/core/filter.c index bdb9cadd4d27..d31aff93270d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4097,30 +4097,80 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch 
(attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; default: return false; } } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sock)) + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -4728,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4783,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 488fe26ac8e5..142d4c35b493 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -519,12 +519,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. 
*/ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 566cec0e0a44..41f50472679d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -412,13 +412,20 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) -- cgit v1.2.3-71-gd317 From 5149cbac4235e12a34cf089592a8bd1c9fcfa467 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 30 Mar 2018 17:27:58 -0400 Subject: locking/rwsem: Add DEBUG_RWSEMS to look for lock/unlock mismatches

For an rwsem, locking can either be exclusive or shared. The corresponding exclusive or shared unlock must be used. Otherwise, the protected data structures may get corrupted or the lock may end up in an inconsistent state. In order to detect such an anomaly, a new configuration option DEBUG_RWSEMS is added which can be enabled to look for such mismatches and print warnings when that happens.

Signed-off-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1522445280-7767-2-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 4 ++++ kernel/locking/rwsem.h | 8 +++++++- lib/Kconfig.debug | 8 ++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c552dbf1..30465a2f2b6c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } @@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); __up_write(sem); @@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_set_reader_owned(sem); __downgrade_write(sem); @@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f1fdc6..a17cba8d94bb 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,6 +16,12 @@ */ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +#else +# define DEBUG_RWSEMS_WARN_ON(c) +#endif + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * do a write to the rwsem cacheline when it is really necessary * to minimize cacheline contention. */ - if (sem->owner != RWSEM_READER_OWNED) + if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 64155e310a9f..88650ab5cedb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1075,6 +1075,13 @@ config DEBUG_WW_MUTEX_SLOWPATH even a debug kernel. If you are a driver writer, enable it. If you are a distro, do not. +config DEBUG_RWSEMS + bool "RW Semaphore debugging: basic checks" + depends on DEBUG_KERNEL && RWSEM_SPIN_ON_OWNER + help + This debugging feature allows mismatched rw semaphore locks and unlocks + to be detected and reported. + config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -1097,6 +1104,7 @@ config PROVE_LOCKING select DEBUG_SPINLOCK select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES + select DEBUG_RWSEMS if RWSEM_SPIN_ON_OWNER select DEBUG_LOCK_ALLOC select TRACE_IRQFLAGS default n -- cgit v1.2.3-71-gd317 From 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Thu, 29 Mar 2018 15:43:01 +0100 Subject: sched/cpufreq/schedutil: Fix error path mutex unlock This patch prevents the 'global_tunables_lock' mutex from being unlocked before being locked. This mutex is not locked if the sugov_kthread_create() function fails. 
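The fix follows the usual goto-unwind convention: each error label undoes exactly what was set up before the failing step, so an unlock may only sit above labels that are reached with the lock held. A schematic with hypothetical helpers (not the actual schedutil code):

	static DEFINE_MUTEX(some_lock);		/* hypothetical lock */

	static int example_init(void)
	{
		int ret;

		ret = alloc_resources();	/* hypothetical setup step */
		if (ret)
			return ret;		/* nothing to undo yet */

		mutex_lock(&some_lock);
		ret = locked_setup();		/* hypothetical step under the lock */
		if (ret)
			goto unlock;		/* reached only with the lock held */

		mutex_unlock(&some_lock);
		return 0;

	unlock:
		mutex_unlock(&some_lock);
		free_resources();		/* undo alloc_resources() */
		return ret;
	}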
Signed-off-by: Jules Maselbas Acked-by: Peter Zijlstra Cc: Chris Redpath Cc: Dietmar Eggermann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Patrick Bellasi Cc: Stephen Kyle Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: nd@arm.com Link: http://lkml.kernel.org/r/20180329144301.38419-1-jules.maselbas@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7936f548e071..617c6741c525 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -625,10 +625,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: -- cgit v1.2.3-71-gd317 From d300b610812f3c10d146db4c18f98eba38834c70 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:26 +0100 Subject: kernel: use kernel_wait4() instead of sys_wait4() All call sites of sys_wait4() set *rusage to NULL. Therefore, there is no need for the copy_to_user() handling of *rusage, and we can use kernel_wait4() directly. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Acked-by: Luis R. Rodriguez Cc: Al Viro Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/exit.c | 2 +- kernel/pid_namespace.c | 6 +++--- kernel/umh.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 995453d9fb55..c3c7ac560114 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { - return sys_wait4(pid, stat_addr, options, NULL); + return kernel_wait4(pid, stat_addr, options, NULL); } #endif diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef7d34b..93b57f026688 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. - * sys_wait4() will also block until our children traced from the + * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); + rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * kernel_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), diff --git a/kernel/umh.c b/kernel/umh.c index 18e5fa4b0e71..f76b3ff876cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + /* If SIGCLD is ignored kernel_wait4 won't populate the status. 
*/ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { @@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) * * Thus the __user pointer cast is valid here. */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); + kernel_wait4(pid, (int __user *)&ret, 0, NULL); /* * If ret is 0, either call_usermodehelper_exec_async failed and -- cgit v1.2.3-71-gd317 From d53238cd51a80f6f2e5b9d64830c62e2086787bd Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:37 +0100 Subject: kernel: open-code sys_rt_sigpending() in sys_sigpending()

A similar but not fully equivalent code path is already open-coded three times (in sys_rt_sigpending and in the two compat stubs), so do it a fourth time here.

This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- include/linux/syscalls.h | 2 +- kernel/signal.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0526286a0314..a63e21e7a3af 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -288,7 +288,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data); asmlinkage long sys_personality(unsigned int personality); -asmlinkage long sys_sigpending(old_sigset_t __user *set); +asmlinkage long sys_sigpending(old_sigset_t __user *uset); asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset); asmlinkage long sys_sigaltstack(const struct sigaltstack __user *uss, diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..985c61749bcf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3629,11 +3629,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) /** * sys_sigpending - examine pending signals - * @set: where mask of pending signal is returned + * @uset: where mask of pending signal is returned */ -SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { - return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); + sigset_t set; + int err; + + if (sizeof(old_sigset_t) > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); + if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; } #ifdef CONFIG_COMPAT -- cgit v1.2.3-71-gd317 From 6b27aef09fea32b805a8c81287b1bb80362dadb0 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 15:18:30 +0100 Subject: kexec: call do_kexec_load() in compat syscall directly

do_kexec_load() can be called directly by compat_sys_kexec_load() as long as the same parameter checks that are currently (also) handled by sys_kexec_load() are completed. Therefore, move those to kexec_load_check(), call that newly introduced helper function from both sys_kexec_load() and compat_sys_kexec_load(), and duplicate the remaining code from sys_kexec_load() in compat_sys_kexec_load().

This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined.
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Eric Biederman Cc: kexec@lists.infradead.org Signed-off-by: Dominik Brodowski --- kernel/kexec.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index e62ec4dc6620..aed8fb2564b3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -192,11 +192,9 @@ out: * that to happen you need to do that yourself. */ -SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, - struct kexec_segment __user *, segments, unsigned long, flags) +static inline int kexec_load_check(unsigned long nr_segments, + unsigned long flags) { - int result; - /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; @@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) return -EINVAL; - /* Verify we are on the appropriate architecture */ - if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && - ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - return -EINVAL; - /* Put an artificial cap on the number * of segments passed to kexec_load. */ if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; + return 0; +} + +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) +{ + int result; + + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, struct kexec_segment out, __user *ksegments; unsigned long i, result; + result = kexec_load_check(nr_segments, flags); + if (result) + return result; + /* Don't allow clients that don't understand the native * architecture to do anything. */ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); @@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - return sys_kexec_load(entry, nr_segments, ksegments, flags); + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. 
+ */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + + mutex_unlock(&kexec_mutex); + + return result; } #endif -- cgit v1.2.3-71-gd317 From 2de0db992de189fccc83fed57c30875144821491 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:26 +0100 Subject: mm: use do_futex() instead of sys_futex() in mm_release() sys_futex() is a wrapper around do_futex() which does not modify any values here: - uaddr, val and val3 are kept the same - op is masked with FUTEX_CMD_MASK, but is always FUTEX_WAKE here, so val2 is always 0 - as utime is set to NULL, the timeout pointer passed to do_futex() is NULL This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Darren Hart Cc: Andrew Morton Reviewed-by: Thomas Gleixner Signed-off-by: Dominik Brodowski --- include/linux/futex.h | 13 ++++++++++--- kernel/fork.c | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/futex.h b/include/linux/futex.h index c0fb9a24bbd2..821ae502d3d8 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -9,9 +9,6 @@ struct inode; struct mm_struct; struct task_struct; -long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 val2, u32 val3); - extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); @@ -55,6 +52,9 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); + +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3); #ifdef CONFIG_HAVE_FUTEX_CMPXCHG #define futex_cmpxchg_enabled 1 #else @@ -64,6 +64,13 @@ extern int futex_cmpxchg_enabled; static inline void exit_robust_list(struct task_struct *curr) { } + +static inline long do_futex(u32 __user *uaddr, int op, u32 val, + ktime_t *timeout, u32 __user *uaddr2, + u32 val2, u32 val3) +{ + return -EINVAL; +} #endif #ifdef CONFIG_FUTEX_PI diff --git a/kernel/fork.c b/kernel/fork.c index e5d9d405ae4e..b1e031aac9db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, - 1, NULL, NULL, 0); + do_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } -- cgit v1.2.3-71-gd317 From 192c58073d9abb1c507c89f109da5dc9f130ba70 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:27 +0100 Subject: kernel: add do_getpgid() helper; remove internal call to sys_getpgid() Using the do_getpgid() helper removes an in-kernel call to the sys_getpgid() syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined.
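The do_getpgid() conversion follows the same recipe as the rest of the series; a minimal, self-contained sketch of the shape (hypothetical names and a placeholder body, not the kernel's actual task/pid lookup):

/* The former syscall body becomes an ordinary static helper... */
static int do_getpgid_sketch(int pid)
{
	return pid ? pid : 1;	/* placeholder for the real pgid lookup */
}

/* ...the syscall itself shrinks to a one-line wrapper... */
long sys_getpgid_sketch(int pid)
{
	return do_getpgid_sketch(pid);
}

/* ...and related syscalls call the helper instead of each other. */
long sys_getpgrp_sketch(void)
{
	return do_getpgid_sketch(0);	/* getpgrp() is getpgid(0) */
}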
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Signed-off-by: Dominik Brodowski --- kernel/sys.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f2289de20e19..ebb138b841c8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1027,7 +1027,7 @@ out: return err; } -SYSCALL_DEFINE1(getpgid, pid_t, pid) +static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; @@ -1055,11 +1055,16 @@ out: return retval; } +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + return do_getpgid(pid); +} + #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { - return sys_getpgid(0); + return do_getpgid(0); } #endif -- cgit v1.2.3-71-gd317 From 6203deb0a76f35b6dc476fecb228c850099a1bc4 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 17:11:51 +0100 Subject: kernel: add do_compat_sigaltstack() helper; remove in-kernel call to compat syscall Using this helper allows us to avoid the in-kernel call to the compat_sys_sigaltstack() syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: "Eric W. Biederman" Cc: Al Viro Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/signal.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 985c61749bcf..f04466655238 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(sigaltstack, - const compat_stack_t __user *, uss_ptr, - compat_stack_t __user *, uoss_ptr) +static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, + compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; @@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, return ret; } +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) +{ + return do_compat_sigaltstack(uss_ptr, uoss_ptr); +} + int compat_restore_altstack(const compat_stack_t __user *uss) { - int err = compat_sys_sigaltstack(uss, NULL); + int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } -- cgit v1.2.3-71-gd317 From e530dca584a9aa4aedf26adf0ed3c1c9b80e2767 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 19 Mar 2018 18:09:27 +0100 Subject: kernel: provide ksys_*() wrappers for syscalls called by kernel/uid16.c Using these helpers allows us to avoid the in-kernel calls to these syscalls: sys_setregid(), sys_setgid(), sys_setreuid(), sys_setuid(), sys_setresuid(), sys_setresgid(), sys_setfsuid(), and sys_setfsgid(). The ksys_ prefix denotes that these functions are meant as drop-in replacements for the syscalls. In particular, they use the same calling convention. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: Eric W.
Biederman Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/sys.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- kernel/uid16.c | 19 ++++++++++--------- kernel/uid16.h | 14 ++++++++++++++ 3 files changed, 74 insertions(+), 17 deletions(-) create mode 100644 kernel/uid16.h (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ebb138b841c8..550f47788ae4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -69,6 +69,8 @@ #include #include +#include "uid16.h" + #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif @@ -340,7 +342,7 @@ out_unlock: * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -392,12 +394,17 @@ error: return retval; } +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + return __sys_setregid(rgid, egid); +} + /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ -SYSCALL_DEFINE1(setgid, gid_t, gid) +long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -429,6 +436,11 @@ error: return retval; } +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + return __sys_setgid(gid); +} + /* * change the user struct in a credentials set to match the new UID */ @@ -473,7 +485,7 @@ static int set_user(struct cred *new) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -533,6 +545,11 @@ error: return retval; } +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + return __sys_setreuid(ruid, euid); +} + /* * setuid() is implemented like SysV with SAVED_IDS * @@ -544,7 +561,7 @@ error: * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -SYSCALL_DEFINE1(setuid, uid_t, uid) +long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -586,12 +603,17 @@ error: return retval; } +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + return __sys_setuid(uid); +} + /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -656,6 +678,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + return __sys_setresuid(ruid, euid, suid); +} + SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); @@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ /* * Same as above, but for rgid, egid, sgid. 
*/ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; @@ -730,6 +757,11 @@ error: return retval; } +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + return __sys_setresgid(rgid, egid, sgid); +} + SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) +long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; @@ -793,10 +825,15 @@ change_okay: return old_fsuid; } +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + return __sys_setfsuid(uid); +} + /* * Samma på svenska.. */ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) +long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; @@ -830,6 +867,11 @@ change_okay: return old_fsgid; } + +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + return __sys_setfsgid(gid); +} #endif /* CONFIG_MULTIUSER */ /** diff --git a/kernel/uid16.c b/kernel/uid16.c index ef1da2a5f9bd..7b930edfe461 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,6 +18,8 @@ #include +#include "uid16.h" + SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { return sys_chown(filename, low2highuid(user), low2highgid(group)); } @@ -35,27 +37,27 @@ SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - return sys_setregid(low2highgid(rgid), low2highgid(egid)); + return __sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - return sys_setgid(low2highgid(gid)); + return __sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - return sys_setreuid(low2highuid(ruid), low2highuid(euid)); + return __sys_setreuid(low2highuid(ruid), low2highuid(euid)); } SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - return sys_setuid(low2highuid(uid)); + return __sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - return sys_setresuid(low2highuid(ruid), low2highuid(euid), + return __sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); } @@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - return sys_setresgid(low2highgid(rgid), low2highgid(egid), + return __sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); } - SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) { const struct cred *cred = current_cred(); @@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - return sys_setfsuid(low2highuid(uid)); + return __sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - return sys_setfsgid(low2highgid(gid)); + return __sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, diff --git a/kernel/uid16.h b/kernel/uid16.h new
file mode 100644 index 000000000000..cdca040f7602 --- /dev/null +++ b/kernel/uid16.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_UID16_H +#define LINUX_UID16_H + +long __sys_setuid(uid_t uid); +long __sys_setgid(gid_t gid); +long __sys_setreuid(uid_t ruid, uid_t euid); +long __sys_setregid(gid_t rgid, gid_t egid); +long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); +long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); +long __sys_setfsuid(uid_t uid); +long __sys_setfsgid(gid_t gid); + +#endif /* LINUX_UID16_H */ -- cgit v1.2.3-71-gd317 From 7d4dd4f159b94003655b1688d9a4c0e2b6268ff8 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Wed, 14 Mar 2018 22:40:35 +0100 Subject: sched: add do_sched_yield() helper; remove in-kernel call to sched_yield() Using the sched-internal do_sched_yield() helper allows us to get rid of the sched-internal call to the sys_sched_yield() syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Dominik Brodowski --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7c535eee0a6..8de4919c889a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4892,7 +4892,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * Return: 0. */ -SYSCALL_DEFINE0(sched_yield) +static void do_sched_yield(void) { struct rq_flags rf; struct rq *rq; @@ -4913,7 +4913,11 @@ SYSCALL_DEFINE0(sched_yield) sched_preempt_enable_no_resched(); schedule(); +} +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); return 0; } @@ -4997,7 +5001,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); void __sched yield(void) { set_current_state(TASK_RUNNING); - sys_sched_yield(); + do_sched_yield(); } EXPORT_SYMBOL(yield); -- cgit v1.2.3-71-gd317 From b6e9b0babb7a02ae4f00f053974609000f00950e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:00:25 +0100 Subject: mm: add kernel_migrate_pages() helper, move compat syscall to mm/mempolicy.c Move compat_sys_migrate_pages() to mm/mempolicy.c and make it call a newly introduced helper -- kernel_migrate_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. 
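The mechanics of such a move can be sketched as follows (hypothetical names, with a plain integer standing in for the node bitmap; the real code additionally round-trips the converted bitmaps through compat_alloc_user_space()):

#include <stdint.h>

/* Shared implementation, now reachable without a syscall (sketch). */
static long kernel_migrate_sketch(const unsigned long *nodes)
{
	return nodes ? 0 : -1;
}

long native_migrate_sketch(const unsigned long *nodes)
{
	return kernel_migrate_sketch(nodes);
}

long compat_migrate_sketch(const uint32_t *nodes32)
{
	unsigned long widened;

	if (!nodes32)
		return kernel_migrate_sketch(0);
	widened = *nodes32;	/* widen the 32-bit user layout first */
	return kernel_migrate_sketch(&widened);
}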
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 33 --------------------------------- mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa8902e7d..51bdf1808943 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -508,39 +508,6 @@ COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} #endif /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..7399ede02b5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1377,9 +1377,9 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return do_set_mempolicy(mode, flags, &nodes); } -SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, - const unsigned long __user *, old_nodes, - const unsigned long __user *, new_nodes) +static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; @@ -1469,6 +1469,13 @@ out_put: } +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); +} + /* Retrieve NUMA policy */ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, @@ -1571,7 +1578,40 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, return sys_mbind(start, len, mode, nm, nr_bits+1, flags); } -#endif +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return kernel_migrate_pages(pid, nr_bits + 1, old, new); +} + +#endif /* CONFIG_COMPAT */ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) -- cgit v1.2.3-71-gd317 From 7addf44388255f6fa99c83e3e2ad79cef0813698 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:08:03 +0100 Subject: mm: add kernel_move_pages() helper, move compat syscall to mm/migrate.c Move compat_sys_move_pages() to mm/migrate.c and make it call a newly introduced helper -- kernel_move_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 22 ---------------------- mm/migrate.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 51bdf1808943..6d21894806b4 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,28 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -#ifdef CONFIG_NUMA -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} -#endif - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/mm/migrate.c b/mm/migrate.c index 1e5525a25691..003886606a22 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1745,10 +1746,10 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, * Move a list of pages in the address space of the currently executing * process. 
*/ -SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, - const void __user * __user *, pages, - const int __user *, nodes, - int __user *, status, int, flags) +static int kernel_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { struct task_struct *task; struct mm_struct *mm; @@ -1807,6 +1808,36 @@ out: return err; } +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) +{ + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif /* CONFIG_COMPAT */ + #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3-71-gd317 From ab0d1e85bfd0c25260f02cd3708d5abdfb5b5a9c Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 4 Mar 2018 21:54:05 +0100 Subject: fs/quota: use COMPAT_SYSCALL_DEFINE for sys32_quotactl() While sys32_quotactl() is only needed on x86, it can use the recommended COMPAT_SYSCALL_DEFINEx() machinery for its setup. Acked-by: Jan Kara Cc: Christoph Hellwig Signed-off-by: Dominik Brodowski --- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- fs/quota/compat.c | 5 +++-- include/linux/compat.h | 3 +++ include/linux/syscalls.h | 2 -- kernel/sys_ni.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ef6edaf285cd..c58f75b088c5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -137,7 +137,7 @@ 128 i386 init_module sys_init_module 129 i386 delete_module sys_delete_module 130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl sys32_quotactl +131 i386 quotactl sys_quotactl compat_sys_quotactl32 132 i386 getpgid sys_getpgid 133 i386 fchdir sys_fchdir 134 i386 bdflush sys_bdflush diff --git a/fs/quota/compat.c b/fs/quota/compat.c index 1577a2fd51f4..c30572857619 100644 --- a/fs/quota/compat.c +++ b/fs/quota/compat.c @@ -41,8 +41,9 @@ struct compat_fs_quota_stat { __u16 qs_iwarnlimit; }; -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, + const char __user *, special, qid_t, id, + void __user *, addr) { unsigned int cmds; struct if_dqblk __user *dqblk; diff --git a/include/linux/compat.h b/include/linux/compat.h index 16c3027074a2..f1649a5e6716 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -461,6 +461,9 @@ asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd, const struct compat_iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); +asmlinkage long compat_sys_quotactl32(unsigned int cmd, + const char __user *special, qid_t id, void __user *addr); + #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 asmlinkage long 
compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a63e21e7a3af..6ab7ed71a8b6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -241,8 +241,6 @@ static inline void addr_limit_user_check(void) #endif } -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b5189762d275..951dbda5c2b4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -18,7 +18,7 @@ asmlinkage long sys_ni_syscall(void) } cond_syscall(sys_quotactl); -cond_syscall(sys32_quotactl); +cond_syscall(compat_sys_quotactl32); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); cond_syscall(compat_sys_lookup_dcookie); -- cgit v1.2.3-71-gd317 From 55731b3cda3a85ee888dac3bf1f36489f275c187 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:55 +0100 Subject: fs: add do_fchownat(), ksys_fchown() helpers and ksys_{,l}chown() wrappers Using the fs-internal do_fchownat() wrapper allows us to get rid of fs-internal calls to the sys_fchownat() syscall. Introducing the ksys_fchown() helper and the ksys_{,l}chown() wrappers allows us to avoid the in-kernel calls to the sys_{,l,f}chown() syscalls. The ksys_ prefix denotes that these functions are meant as drop-in replacements for the syscalls. In particular, they use the same calling convention as sys_{,l,f}chown(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- arch/s390/kernel/compat_linux.c | 6 +++--- fs/internal.h | 2 ++ fs/open.c | 23 +++++++++++++++++------ include/linux/syscalls.h | 17 +++++++++++++++++ init/initramfs.c | 8 ++++---- kernel/uid16.c | 6 +++--- 6 files changed, 46 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 5a9cfde5fc28..9a9bb395359c 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -89,18 +89,18 @@ COMPAT_SYSCALL_DEFINE3(s390_chown16, const char __user *, filename, u16, user, u16, group) { - return sys_chown(filename, low2highuid(user), low2highgid(group)); + return ksys_chown(filename, low2highuid(user), low2highgid(group)); } COMPAT_SYSCALL_DEFINE3(s390_lchown16, const char __user *, filename, u16, user, u16, group) { - return sys_lchown(filename, low2highuid(user), low2highgid(group)); + return ksys_lchown(filename, low2highuid(user), low2highgid(group)); } COMPAT_SYSCALL_DEFINE3(s390_fchown16, unsigned int, fd, u16, user, u16, group) { - return sys_fchown(fd, low2highuid(user), low2highgid(group)); + return ksys_fchown(fd, low2highuid(user), low2highgid(group)); } COMPAT_SYSCALL_DEFINE2(s390_setregid16, u16, rgid, u16, egid) diff --git a/fs/internal.h b/fs/internal.h index 26f4f05b52ef..c797480cbd6f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -121,6 +121,8 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); +int do_fchownat(int dfd,
const char __user *filename, uid_t user, gid_t group, + int flag); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); diff --git a/fs/open.c b/fs/open.c index 0fc8188be31a..7b2eccb541f2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -645,8 +645,8 @@ retry_deleg: return error; } -SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, - gid_t, group, int, flag) +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag) { struct path path; int error = -EINVAL; @@ -677,18 +677,24 @@ out: return error; } +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + return do_fchownat(dfd, filename, user, group, flag); +} + SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, 0); + return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, - AT_SYMLINK_NOFOLLOW); + return do_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } -SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; @@ -708,6 +714,11 @@ out: return error; } +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +{ + return ksys_fchown(fd, user, group); +} + int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33f06de090ea..df0d1e818a6e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -954,6 +954,7 @@ int ksys_chroot(const char __user *filename); ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count); int ksys_chdir(const char __user *filename); int ksys_fchmod(unsigned int fd, umode_t mode); +int ksys_fchown(unsigned int fd, uid_t user, gid_t group); /* * The following kernel syscall equivalents are just wrappers to fs-internal @@ -1021,4 +1022,20 @@ static inline long ksys_access(const char __user *filename, int mode) return do_faccessat(AT_FDCWD, filename, mode); } +extern int do_fchownat(int dfd, const char __user *filename, uid_t user, + gid_t group, int flag); + +static inline long ksys_chown(const char __user *filename, uid_t user, + gid_t group) +{ + return do_fchownat(AT_FDCWD, filename, user, group, 0); +} + +static inline long ksys_lchown(const char __user *filename, uid_t user, + gid_t group) +{ + return do_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); +} + #endif diff --git a/init/initramfs.c b/init/initramfs.c index 16c3c23076e2..35173bef7c00 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -343,7 +343,7 @@ static int __init do_name(void) wfd = sys_open(collected, openflags, mode); if (wfd >= 0) { - sys_fchown(wfd, uid, gid); + ksys_fchown(wfd, uid, gid); ksys_fchmod(wfd, mode); if (body_len) sys_ftruncate(wfd, body_len); @@ -353,14 +353,14 @@ static int __init do_name(void) } } else if (S_ISDIR(mode)) { ksys_mkdir(collected, mode); - sys_chown(collected, uid, gid); + ksys_chown(collected, uid, gid); ksys_chmod(collected, mode); dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { ksys_mknod(collected, 
mode, rdev); - sys_chown(collected, uid, gid); + ksys_chown(collected, uid, gid); ksys_chmod(collected, mode); do_utime(collected, mtime); } @@ -393,7 +393,7 @@ static int __init do_symlink(void) collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); ksys_symlink(collected + N_ALIGN(name_len), collected); - sys_lchown(collected, uid, gid); + ksys_lchown(collected, uid, gid); do_utime(collected, mtime); state = SkipIt; next_state = Reset; diff --git a/kernel/uid16.c b/kernel/uid16.c index 7b930edfe461..af6925d8599b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -22,17 +22,17 @@ SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_chown(filename, low2highuid(user), low2highgid(group)); + return ksys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - return sys_lchown(filename, low2highuid(user), low2highgid(group)); + return ksys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - return sys_fchown(fd, low2highuid(user), low2highgid(group)); + return ksys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) -- cgit v1.2.3-71-gd317 From 70f68ee81e2e9ad5105b8d2bd324e890e94c6ad9 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Wed, 14 Mar 2018 22:35:11 +0100 Subject: fs: add ksys_sync() helper; remove in-kernel calls to sys_sync() Using this helper allows us to avoid the in-kernel calls to the sys_sync() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_sync(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Alexander Viro Signed-off-by: Dominik Brodowski --- arch/sparc/kernel/setup_32.c | 2 +- drivers/tty/sysrq.c | 2 +- fs/sync.c | 7 ++++++- include/linux/syscalls.h | 1 + kernel/power/hibernate.c | 2 +- kernel/power/suspend.c | 2 +- kernel/power/user.c | 2 +- 7 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 2e3a3e203061..13664c377196 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -86,7 +86,7 @@ static void prom_sync_me(void) show_free_areas(0, NULL); if (!is_idle_task(current)) { local_irq_enable(); - sys_sync(); + ksys_sync(); local_irq_disable(); } prom_printf("Returning to prom\n"); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index b674793be478..6364890575ec 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -660,7 +660,7 @@ static void sysrq_do_reset(struct timer_list *t) state->reset_requested = true; - sys_sync(); + ksys_sync(); kernel_restart(NULL); } diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..602ae94bb67e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -105,7 +105,7 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) * just write metadata (such as inodes or bitmaps) to block device page cache * and do not sync it on their own in ->sync_fs(). 
*/ -SYSCALL_DEFINE0(sync) +void ksys_sync(void) { int nowait = 0, wait = 1; @@ -117,6 +117,11 @@ SYSCALL_DEFINE0(sync) iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) laptop_sync_completion(); +} + +SYSCALL_DEFINE0(sync) +{ + ksys_sync(); return 0; } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3a2e90842ff8..0a9942b3e718 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -960,6 +960,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence); ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count); +void ksys_sync(void); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..4710f1b142fc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -701,7 +701,7 @@ int hibernate(void) } pr_info("Syncing filesystems ... \n"); - sys_sync(); + ksys_sync(); pr_info("done.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0685c4499431..4c10be0f4843 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); pr_info("Syncing filesystems ... "); - sys_sync(); + ksys_sync(); pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/power/user.c b/kernel/power/user.c index 22df9f7ff672..75c959de4b29 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; printk("Syncing filesystems ... "); - sys_sync(); + ksys_sync(); printk("done.\n"); error = freeze_processes(); -- cgit v1.2.3-71-gd317 From 9b32105ec6b13d32d5db6a6e7992c97ce54b5ea7 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:42 +0100 Subject: kernel: add ksys_unshare() helper; remove in-kernel calls to sys_unshare() Using this helper allows us to avoid the in-kernel calls to the sys_unshare() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_unshare(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. 
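What an in-kernel caller gains is visible in the devtmpfs hunk below; abstracted into a sketch (hypothetical names, only the CLONE_NEWNS constant is real), the conversion is purely textual because the helper keeps the syscall's argument list:

/* Sketch of the helper and of a kernel-thread caller like devtmpfsd(). */
static int ksys_unshare_sketch(unsigned long unshare_flags)
{
	/* the former sys_unshare() body, callable from anywhere in-kernel */
	return unshare_flags ? 0 : -1;
}

static int setup_thread_sketch(void)
{
	int err = ksys_unshare_sketch(0x00020000UL);	/* CLONE_NEWNS */

	if (err)
		return err;
	/* mounting devtmpfs on "/" would follow here */
	return 0;
}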
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: Andrew Morton Cc: Ingo Molnar Signed-off-by: Dominik Brodowski --- drivers/base/devtmpfs.c | 2 +- include/linux/syscalls.h | 1 + init/do_mounts_initrd.c | 2 +- kernel/fork.c | 7 ++++++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 3f7ee954fd2c..f7768077e817 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -380,7 +380,7 @@ static int devtmpfsd(void *p) { char options[] = "mode=0755"; int *err = p; - *err = sys_unshare(CLONE_NEWNS); + *err = ksys_unshare(CLONE_NEWNS); if (*err) goto out; *err = ksys_mount("devtmpfs", "/", "devtmpfs", MS_SILENT, options); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0a9942b3e718..e724dda509e0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -961,6 +961,7 @@ int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence); ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count); void ksys_sync(void); +int ksys_unshare(unsigned long unshare_flags); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 03ec0c1b7553..d1d3e53bdeef 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -36,7 +36,7 @@ __setup("noinitrd", no_initrd); static int init_linuxrc(struct subprocess_info *info, struct cred *new) { - sys_unshare(CLONE_FS | CLONE_FILES); + ksys_unshare(CLONE_FS | CLONE_FILES); /* stdin/stdout/stderr for /linuxrc */ ksys_open("/dev/console", O_RDWR, 0); ksys_dup(0); diff --git a/kernel/fork.c b/kernel/fork.c index b1e031aac9db..f71b67dc156d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp * constructed. Here we are modifying the current, active, * task_struct. */ -SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; @@ -2470,6 +2470,11 @@ bad_unshare_out: return err; } +SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +{ + return ksys_unshare(unshare_flags); +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to -- cgit v1.2.3-71-gd317 From e2aaa9f423367ee03755d632555c242629a08d00 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Fri, 16 Mar 2018 12:36:06 +0100 Subject: kernel: add ksys_setsid() helper; remove in-kernel call to sys_setsid() Using this helper allows us to avoid the in-kernel call to the sys_setsid() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_setsid(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. 
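One detail worth keeping in mind, shown as a sketch with hypothetical names: ksys_setsid() preserves sys_setsid()'s return convention, handing back the new session id on success and a negative errno (-EPERM) on failure, so a caller such as init_linuxrc() below behaves exactly as before:

#include <errno.h>

static int ksys_setsid_sketch(void)
{
	int already_a_leader = 0;	/* placeholder for the real check */

	if (already_a_leader)
		return -EPERM;		/* same error the syscall returned */
	return 42;			/* stand-in for the new session id */
}

long sys_setsid_sketch(void)
{
	return ksys_setsid_sketch();	/* the value passes through unchanged */
}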
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Signed-off-by: Dominik Brodowski --- include/linux/syscalls.h | 1 + init/do_mounts_initrd.c | 2 +- kernel/sys.c | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e724dda509e0..4dd685ee425d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -962,6 +962,7 @@ off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence); ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count); void ksys_sync(void); int ksys_unshare(unsigned long unshare_flags); +int ksys_setsid(void); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index d1d3e53bdeef..5a91aefa7305 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -45,7 +45,7 @@ static int init_linuxrc(struct subprocess_info *info, struct cred *new) ksys_chdir("/root"); ksys_mount(".", "/", NULL, MS_MOVE, NULL); ksys_chroot("."); - sys_setsid(); + ksys_setsid(); return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index 550f47788ae4..ad692183dfe9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1150,7 +1150,7 @@ static void set_special_pids(struct pid *pid) change_pid(curr, PIDTYPE_PGID, pid); } -SYSCALL_DEFINE0(setsid) +int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1183,6 +1183,11 @@ out: return err; } +SYSCALL_DEFINE0(setsid) +{ + return ksys_setsid(); +} + DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE -- cgit v1.2.3-71-gd317 From 70dd4b3160798b647b7f30baf9fb6e8a5933d4e2 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 19:53:01 +0100 Subject: kernel/sys_ni: sort cond_syscall() entries Shuffle the cond_syscall() entries in kernel/sys_ni.c around so that they are kept in the same order as in include/uapi/asm-generic/unistd.h. For better structuring, add the same comments as in that file, but keep a few additional comments and extend the commentary where it seems useful. Signed-off-by: Dominik Brodowski --- kernel/sys_ni.c | 506 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 332 insertions(+), 174 deletions(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 951dbda5c2b4..0c1538f5a559 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -17,245 +17,403 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_quotactl); -cond_syscall(compat_sys_quotactl32); -cond_syscall(sys_acct); +/* + * This list is kept in the same order as include/uapi/asm-generic/unistd.h. + * Architecture specific entries go below, followed by deprecated or obsolete + * system calls. 
+ */ + +cond_syscall(sys_io_setup); +cond_syscall(compat_sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(compat_sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); +cond_syscall(compat_sys_io_getevents); + +/* fs/xattr.c */ + +/* fs/dcache.c */ + +/* fs/cookies.c */ cond_syscall(sys_lookup_dcookie); cond_syscall(compat_sys_lookup_dcookie); -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); + +/* fs/eventfd.c */ +cond_syscall(sys_eventfd2); + +/* fs/eventpoll.c */ +cond_syscall(sys_epoll_create1); +cond_syscall(sys_epoll_ctl); +cond_syscall(sys_epoll_pwait); +cond_syscall(compat_sys_epoll_pwait); + +/* fs/fcntl.c */ + +/* fs/inotify_user.c */ +cond_syscall(sys_inotify_init1); +cond_syscall(sys_inotify_add_watch); +cond_syscall(sys_inotify_rm_watch); + +/* fs/ioctl.c */ + +/* fs/ioprio.c */ +cond_syscall(sys_ioprio_set); +cond_syscall(sys_ioprio_get); + +/* fs/locks.c */ +cond_syscall(sys_flock); + +/* fs/namei.c */ + +/* fs/namespace.c */ + +/* fs/nfsctl.c */ + +/* fs/open.c */ + +/* fs/pipe.c */ + +/* fs/quota.c */ +cond_syscall(sys_quotactl); + +/* fs/readdir.c */ + +/* fs/read_write.c */ + +/* fs/sendfile.c */ + +/* fs/select.c */ + +/* fs/signalfd.c */ +cond_syscall(sys_signalfd4); +cond_syscall(compat_sys_signalfd4); + +/* fs/splice.c */ + +/* fs/stat.c */ + +/* fs/sync.c */ + +/* fs/timerfd.c */ +cond_syscall(sys_timerfd_create); +cond_syscall(sys_timerfd_settime); +cond_syscall(compat_sys_timerfd_settime); +cond_syscall(sys_timerfd_gettime); +cond_syscall(compat_sys_timerfd_gettime); + +/* fs/utimes.c */ + +/* kernel/acct.c */ +cond_syscall(sys_acct); + +/* kernel/capability.c */ +cond_syscall(sys_capget); +cond_syscall(sys_capset); + +/* kernel/exec_domain.c */ + +/* kernel/exit.c */ + +/* kernel/fork.c */ + +/* kernel/futex.c */ +cond_syscall(sys_futex); +cond_syscall(compat_sys_futex); +cond_syscall(sys_set_robust_list); +cond_syscall(compat_sys_set_robust_list); +cond_syscall(sys_get_robust_list); +cond_syscall(compat_sys_get_robust_list); + +/* kernel/hrtimer.c */ + +/* kernel/itimer.c */ + +/* kernel/kexec.c */ cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); -cond_syscall(sys_kexec_file_load); + +/* kernel/module.c */ cond_syscall(sys_init_module); -cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); + +/* kernel/posix-timers.c */ + +/* kernel/printk.c */ +cond_syscall(sys_syslog); + +/* kernel/ptrace.c */ + +/* kernel/sched/core.c */ + +/* kernel/signal.c */ + +/* kernel/sys.c */ +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); + +/* kernel/time.c */ + +/* kernel/timer.c */ + +/* ipc/mqueue.c */ +cond_syscall(sys_mq_open); +cond_syscall(compat_sys_mq_open); +cond_syscall(sys_mq_unlink); +cond_syscall(sys_mq_timedsend); +cond_syscall(compat_sys_mq_timedsend); +cond_syscall(sys_mq_timedreceive); +cond_syscall(compat_sys_mq_timedreceive); +cond_syscall(sys_mq_notify); +cond_syscall(compat_sys_mq_notify); +cond_syscall(sys_mq_getsetattr); +cond_syscall(compat_sys_mq_getsetattr); + +/* ipc/msg.c */ +cond_syscall(sys_msgget); +cond_syscall(sys_msgctl); +cond_syscall(compat_sys_msgctl); +cond_syscall(sys_msgrcv); +cond_syscall(compat_sys_msgrcv); +cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd); + +/* ipc/sem.c */ +cond_syscall(sys_semget); +cond_syscall(sys_semctl); +cond_syscall(compat_sys_semctl); +cond_syscall(sys_semtimedop); +cond_syscall(compat_sys_semtimedop); +cond_syscall(sys_semop); + +/* ipc/shm.c */ +cond_syscall(sys_shmget); +cond_syscall(sys_shmctl); +cond_syscall(compat_sys_shmctl); +cond_syscall(sys_shmat); +cond_syscall(compat_sys_shmat); +cond_syscall(sys_shmdt); + +/* net/socket.c */ +cond_syscall(sys_socket); cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); -cond_syscall(sys_accept4); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); -cond_syscall(sys_sendto); -cond_syscall(sys_send); -cond_syscall(sys_recvfrom); -cond_syscall(sys_recv); -cond_syscall(sys_socket); cond_syscall(sys_setsockopt); cond_syscall(compat_sys_setsockopt); cond_syscall(sys_getsockopt); cond_syscall(compat_sys_getsockopt); +cond_syscall(sys_sendto); cond_syscall(sys_shutdown); +cond_syscall(sys_recvfrom); +cond_syscall(compat_sys_recvfrom); cond_syscall(sys_sendmsg); -cond_syscall(sys_sendmmsg); cond_syscall(compat_sys_sendmsg); -cond_syscall(compat_sys_sendmmsg); cond_syscall(sys_recvmsg); -cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); -cond_syscall(compat_sys_recv); -cond_syscall(compat_sys_recvfrom); -cond_syscall(compat_sys_recvmmsg); -cond_syscall(sys_socketcall); -cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); -cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); -cond_syscall(sys_epoll_create); -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_wait); -cond_syscall(sys_epoll_pwait); -cond_syscall(compat_sys_epoll_pwait); -cond_syscall(sys_semget); -cond_syscall(sys_semop); -cond_syscall(sys_semtimedop); -cond_syscall(compat_sys_semtimedop); -cond_syscall(sys_semctl); -cond_syscall(compat_sys_semctl); -cond_syscall(sys_msgget); -cond_syscall(sys_msgsnd); -cond_syscall(compat_sys_msgsnd); -cond_syscall(sys_msgrcv); -cond_syscall(compat_sys_msgrcv); -cond_syscall(sys_msgctl); -cond_syscall(compat_sys_msgctl); -cond_syscall(sys_shmget); -cond_syscall(sys_shmat); -cond_syscall(compat_sys_shmat); -cond_syscall(sys_shmdt); -cond_syscall(sys_shmctl); -cond_syscall(compat_sys_shmctl); -cond_syscall(sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(sys_mq_notify); -cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_open); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(compat_sys_mq_notify); -cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_mbind); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(compat_sys_set_mempolicy); + +/* mm/filemap.c */ + +/* mm/nommu.c, also with MMU */ +cond_syscall(sys_mremap); + +/* security/keys/keyctl.c */ cond_syscall(sys_add_key); cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); -cond_syscall(compat_sys_socketcall); -cond_syscall(sys_inotify_init); -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); -cond_syscall(sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(sys_chown16); -cond_syscall(sys_fchown16); 
-cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); -cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); -cond_syscall(sys_sgetmask); -cond_syscall(sys_ssetmask); -cond_syscall(sys_vm86old); -cond_syscall(sys_vm86); -cond_syscall(sys_modify_ldt); -cond_syscall(sys_ipc); -cond_syscall(compat_sys_ipc); -cond_syscall(compat_sys_sysctl); -cond_syscall(sys_flock); -cond_syscall(sys_io_setup); -cond_syscall(sys_io_destroy); -cond_syscall(sys_io_submit); -cond_syscall(sys_io_cancel); -cond_syscall(sys_io_getevents); -cond_syscall(compat_sys_io_setup); -cond_syscall(compat_sys_io_submit); -cond_syscall(compat_sys_io_getevents); -cond_syscall(sys_sysfs); -cond_syscall(sys_syslog); -cond_syscall(sys_process_vm_readv); -cond_syscall(sys_process_vm_writev); -cond_syscall(compat_sys_process_vm_readv); -cond_syscall(compat_sys_process_vm_writev); -cond_syscall(sys_uselib); -cond_syscall(sys_fadvise64); -cond_syscall(sys_fadvise64_64); -cond_syscall(sys_madvise); -cond_syscall(sys_setuid); -cond_syscall(sys_setregid); -cond_syscall(sys_setgid); -cond_syscall(sys_setreuid); -cond_syscall(sys_setresuid); -cond_syscall(sys_getresuid); -cond_syscall(sys_setresgid); -cond_syscall(sys_getresgid); -cond_syscall(sys_setgroups); -cond_syscall(sys_getgroups); -cond_syscall(sys_setfsuid); -cond_syscall(sys_setfsgid); -cond_syscall(sys_capget); -cond_syscall(sys_capset); -cond_syscall(sys_copy_file_range); -/* arch-specific weak syscall entries */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); -cond_syscall(compat_sys_s390_ipc); -cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); -cond_syscall(sys_s390_pci_mmio_read); -cond_syscall(sys_s390_pci_mmio_write); +/* arch/example/kernel/sys_example.c */ -/* mmu depending weak syscall entries */ +/* mm/fadvise.c */ +cond_syscall(sys_fadvise64_64); + +/* mm/, CONFIG_MMU only */ +cond_syscall(sys_swapon); +cond_syscall(sys_swapoff); cond_syscall(sys_mprotect); cond_syscall(sys_msync); cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); -cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); -cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); -cond_syscall(compat_sys_move_pages); +cond_syscall(sys_mbind); +cond_syscall(compat_sys_mbind); +cond_syscall(sys_get_mempolicy); +cond_syscall(compat_sys_get_mempolicy); +cond_syscall(sys_set_mempolicy); +cond_syscall(compat_sys_set_mempolicy); +cond_syscall(sys_migrate_pages); cond_syscall(compat_sys_migrate_pages); +cond_syscall(sys_move_pages); +cond_syscall(compat_sys_move_pages); -/* block-layer dependent */ -cond_syscall(sys_bdflush); -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); - -/* New file descriptors */ -cond_syscall(sys_signalfd); -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_signalfd4); -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_settime); 
-cond_syscall(compat_sys_timerfd_gettime); -cond_syscall(sys_eventfd); -cond_syscall(sys_eventfd2); -cond_syscall(sys_memfd_create); -cond_syscall(sys_userfaultfd); - -/* performance counters: */ cond_syscall(sys_perf_event_open); +cond_syscall(sys_accept4); +cond_syscall(sys_recvmmsg); +cond_syscall(compat_sys_recvmmsg); + +/* + * Architecture specific syscalls: see further below + */ -/* fanotify! */ +/* fanotify */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); -cond_syscall(compat_sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); +cond_syscall(sys_sendmmsg); +cond_syscall(compat_sys_sendmmsg); +cond_syscall(sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_writev); + /* compare kernel pointers */ cond_syscall(sys_kcmp); +cond_syscall(sys_finit_module); + /* operate on Secure Computing state */ cond_syscall(sys_seccomp); +cond_syscall(sys_memfd_create); + /* access BPF programs and maps */ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); +cond_syscall(sys_userfaultfd); + /* membarrier */ cond_syscall(sys_membarrier); +cond_syscall(sys_mlock2); + +cond_syscall(sys_copy_file_range); + /* memory protection keys */ cond_syscall(sys_pkey_mprotect); cond_syscall(sys_pkey_alloc); cond_syscall(sys_pkey_free); + + +/* + * Architecture specific weak syscall entries. + */ + +/* pciconfig: alpha, arm, arm64, ia64, sparc */ +cond_syscall(sys_pciconfig_read); +cond_syscall(sys_pciconfig_write); +cond_syscall(sys_pciconfig_iobase); + +/* sys_socketcall: arm, mips, x86, ... */ +cond_syscall(sys_socketcall); +cond_syscall(compat_sys_socketcall); + +/* compat syscalls for arm64, x86, ... 
*/ +cond_syscall(compat_sys_sysctl); +cond_syscall(compat_sys_fanotify_mark); + +/* x86 */ +cond_syscall(sys_vm86old); +cond_syscall(sys_modify_ldt); +cond_syscall(compat_sys_quotactl32); +cond_syscall(sys_vm86); +cond_syscall(sys_kexec_file_load); + +/* s390 */ +cond_syscall(sys_s390_pci_mmio_read); +cond_syscall(sys_s390_pci_mmio_write); +cond_syscall(compat_sys_s390_ipc); + +/* powerpc */ +cond_syscall(ppc_rtas); +cond_syscall(sys_spu_run); +cond_syscall(sys_spu_create); +cond_syscall(sys_subpage_prot); + + +/* + * Deprecated system calls which are still defined in + * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch + */ + +/* __ARCH_WANT_SYSCALL_NO_FLAGS */ +cond_syscall(sys_epoll_create); +cond_syscall(sys_inotify_init); +cond_syscall(sys_eventfd); +cond_syscall(sys_signalfd); +cond_syscall(compat_sys_signalfd); + +/* __ARCH_WANT_SYSCALL_OFF_T */ +cond_syscall(sys_fadvise64); + +/* __ARCH_WANT_SYSCALL_DEPRECATED */ +cond_syscall(sys_epoll_wait); +cond_syscall(sys_recv); +cond_syscall(compat_sys_recv); +cond_syscall(sys_send); +cond_syscall(sys_bdflush); +cond_syscall(sys_uselib); + + +/* + * The syscalls below are not found in include/uapi/asm-generic/unistd.h + */ + +/* obsolete: SGETMASK_SYSCALL */ +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); + +/* obsolete: SYSFS_SYSCALL */ +cond_syscall(sys_sysfs); + +/* obsolete: __ARCH_WANT_SYS_IPC */ +cond_syscall(sys_ipc); +cond_syscall(compat_sys_ipc); + +/* obsolete: UID16 */ +cond_syscall(sys_chown16); +cond_syscall(sys_fchown16); +cond_syscall(sys_getegid16); +cond_syscall(sys_geteuid16); +cond_syscall(sys_getgid16); +cond_syscall(sys_getgroups16); +cond_syscall(sys_getresgid16); +cond_syscall(sys_getresuid16); +cond_syscall(sys_getuid16); +cond_syscall(sys_lchown16); +cond_syscall(sys_setfsgid16); +cond_syscall(sys_setfsuid16); +cond_syscall(sys_setgid16); +cond_syscall(sys_setgroups16); +cond_syscall(sys_setregid16); +cond_syscall(sys_setresgid16); +cond_syscall(sys_setresuid16); +cond_syscall(sys_setreuid16); +cond_syscall(sys_setuid16); -- cgit v1.2.3-71-gd317 From 67a7acd3773a94df2e671601a288685485463cf9 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 4 Mar 2018 19:06:35 +0100 Subject: kernel/sys_ni: remove {sys_,compat_sys_} prefixes from cond_syscall definitions This keeps it in line with the SYSCALL_DEFINEx() / COMPAT_SYSCALL_DEFINEx() calling convention. Signed-off-by: Dominik Brodowski --- Documentation/process/adding-syscalls.rst | 2 +- kernel/sys_ni.c | 433 +++++++++++++++--------------- 2 files changed, 219 insertions(+), 216 deletions(-) (limited to 'kernel') diff --git a/Documentation/process/adding-syscalls.rst b/Documentation/process/adding-syscalls.rst index 556613744556..314c8bf6f2a2 100644 --- a/Documentation/process/adding-syscalls.rst +++ b/Documentation/process/adding-syscalls.rst @@ -222,7 +222,7 @@ your new syscall number may get adjusted to resolve conflicts. The file ``kernel/sys_ni.c`` provides a fallback stub implementation of each system call, returning ``-ENOSYS``.
Add your new system call here too:: - cond_syscall(sys_xyzzy); + COND_SYSCALL(xyzzy); Your new kernel functionality, and the system call that controls it, should normally be optional, so add a ``CONFIG`` option (typically to diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0c1538f5a559..6cafc008f6db 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -17,53 +17,56 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } +#define COND_SYSCALL(name) cond_syscall(sys_##name) +#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) + /* * This list is kept in the same order as include/uapi/asm-generic/unistd.h. * Architecture specific entries go below, followed by deprecated or obsolete * system calls. */ -cond_syscall(sys_io_setup); -cond_syscall(compat_sys_io_setup); -cond_syscall(sys_io_destroy); -cond_syscall(sys_io_submit); -cond_syscall(compat_sys_io_submit); -cond_syscall(sys_io_cancel); -cond_syscall(sys_io_getevents); -cond_syscall(compat_sys_io_getevents); +COND_SYSCALL(io_setup); +COND_SYSCALL_COMPAT(io_setup); +COND_SYSCALL(io_destroy); +COND_SYSCALL(io_submit); +COND_SYSCALL_COMPAT(io_submit); +COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents); +COND_SYSCALL_COMPAT(io_getevents); /* fs/xattr.c */ /* fs/dcache.c */ /* fs/cookies.c */ -cond_syscall(sys_lookup_dcookie); -cond_syscall(compat_sys_lookup_dcookie); +COND_SYSCALL(lookup_dcookie); +COND_SYSCALL_COMPAT(lookup_dcookie); /* fs/eventfd.c */ -cond_syscall(sys_eventfd2); +COND_SYSCALL(eventfd2); /* fs/eventfd.c */ -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_pwait); -cond_syscall(compat_sys_epoll_pwait); +COND_SYSCALL(epoll_create1); +COND_SYSCALL(epoll_ctl); +COND_SYSCALL(epoll_pwait); +COND_SYSCALL_COMPAT(epoll_pwait); /* fs/fcntl.c */ /* fs/inotify_user.c */ -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); +COND_SYSCALL(inotify_init1); +COND_SYSCALL(inotify_add_watch); +COND_SYSCALL(inotify_rm_watch); /* fs/ioctl.c */ /* fs/ioprio.c */ -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); +COND_SYSCALL(ioprio_set); +COND_SYSCALL(ioprio_get); /* fs/locks.c */ -cond_syscall(sys_flock); +COND_SYSCALL(flock); /* fs/namei.c */ @@ -76,7 +79,7 @@ cond_syscall(sys_flock); /* fs/pipe.c */ /* fs/quota.c */ -cond_syscall(sys_quotactl); +COND_SYSCALL(quotactl); /* fs/readdir.c */ @@ -87,8 +90,8 @@ cond_syscall(sys_quotactl); /* fs/select.c */ /* fs/signalfd.c */ -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd4); +COND_SYSCALL(signalfd4); +COND_SYSCALL_COMPAT(signalfd4); /* fs/splice.c */ @@ -97,20 +100,20 @@ cond_syscall(compat_sys_signalfd4); /* fs/sync.c */ /* fs/timerfd.c */ -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(compat_sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_gettime); +COND_SYSCALL(timerfd_create); +COND_SYSCALL(timerfd_settime); +COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_gettime); +COND_SYSCALL_COMPAT(timerfd_gettime); /* fs/utimes.c */ /* kernel/acct.c */ -cond_syscall(sys_acct); +COND_SYSCALL(acct); /* kernel/capability.c */ -cond_syscall(sys_capget); -cond_syscall(sys_capset); +COND_SYSCALL(capget); +COND_SYSCALL(capset); /* kernel/exec_domain.c */ @@ -119,29 +122,29 @@ cond_syscall(sys_capset); /* kernel/fork.c */ /* kernel/futex.c */ -cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); 
-cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); +COND_SYSCALL(futex); +COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(set_robust_list); +COND_SYSCALL_COMPAT(set_robust_list); +COND_SYSCALL(get_robust_list); +COND_SYSCALL_COMPAT(get_robust_list); /* kernel/hrtimer.c */ /* kernel/itimer.c */ /* kernel/kexec.c */ -cond_syscall(sys_kexec_load); -cond_syscall(compat_sys_kexec_load); +COND_SYSCALL(kexec_load); +COND_SYSCALL_COMPAT(kexec_load); /* kernel/module.c */ -cond_syscall(sys_init_module); -cond_syscall(sys_delete_module); +COND_SYSCALL(init_module); +COND_SYSCALL(delete_module); /* kernel/posix-timers.c */ /* kernel/printk.c */ -cond_syscall(sys_syslog); +COND_SYSCALL(syslog); /* kernel/ptrace.c */ @@ -150,176 +153,176 @@ cond_syscall(sys_syslog); /* kernel/signal.c */ /* kernel/sys.c */ -cond_syscall(sys_setregid); -cond_syscall(sys_setgid); -cond_syscall(sys_setreuid); -cond_syscall(sys_setuid); -cond_syscall(sys_setresuid); -cond_syscall(sys_getresuid); -cond_syscall(sys_setresgid); -cond_syscall(sys_getresgid); -cond_syscall(sys_setfsuid); -cond_syscall(sys_setfsgid); -cond_syscall(sys_setgroups); -cond_syscall(sys_getgroups); +COND_SYSCALL(setregid); +COND_SYSCALL(setgid); +COND_SYSCALL(setreuid); +COND_SYSCALL(setuid); +COND_SYSCALL(setresuid); +COND_SYSCALL(getresuid); +COND_SYSCALL(setresgid); +COND_SYSCALL(getresgid); +COND_SYSCALL(setfsuid); +COND_SYSCALL(setfsgid); +COND_SYSCALL(setgroups); +COND_SYSCALL(getgroups); /* kernel/time.c */ /* kernel/timer.c */ /* ipc/mqueue.c */ -cond_syscall(sys_mq_open); -cond_syscall(compat_sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(sys_mq_notify); -cond_syscall(compat_sys_mq_notify); -cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_getsetattr); +COND_SYSCALL(mq_open); +COND_SYSCALL_COMPAT(mq_open); +COND_SYSCALL(mq_unlink); +COND_SYSCALL(mq_timedsend); +COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedreceive); +COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_notify); +COND_SYSCALL_COMPAT(mq_notify); +COND_SYSCALL(mq_getsetattr); +COND_SYSCALL_COMPAT(mq_getsetattr); /* ipc/msg.c */ -cond_syscall(sys_msgget); -cond_syscall(sys_msgctl); -cond_syscall(compat_sys_msgctl); -cond_syscall(sys_msgrcv); -cond_syscall(compat_sys_msgrcv); -cond_syscall(sys_msgsnd); -cond_syscall(compat_sys_msgsnd); +COND_SYSCALL(msgget); +COND_SYSCALL(msgctl); +COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL(msgrcv); +COND_SYSCALL_COMPAT(msgrcv); +COND_SYSCALL(msgsnd); +COND_SYSCALL_COMPAT(msgsnd); /* ipc/sem.c */ -cond_syscall(sys_semget); -cond_syscall(sys_semctl); -cond_syscall(compat_sys_semctl); -cond_syscall(sys_semtimedop); -cond_syscall(compat_sys_semtimedop); -cond_syscall(sys_semop); +COND_SYSCALL(semget); +COND_SYSCALL(semctl); +COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL(semtimedop); +COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semop); /* ipc/shm.c */ -cond_syscall(sys_shmget); -cond_syscall(sys_shmctl); -cond_syscall(compat_sys_shmctl); -cond_syscall(sys_shmat); -cond_syscall(compat_sys_shmat); -cond_syscall(sys_shmdt); +COND_SYSCALL(shmget); +COND_SYSCALL(shmctl); +COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL(shmat); +COND_SYSCALL_COMPAT(shmat); +COND_SYSCALL(shmdt); /* net/socket.c */ -cond_syscall(sys_socket); -cond_syscall(sys_socketpair); -cond_syscall(sys_bind); -cond_syscall(sys_listen); -cond_syscall(sys_accept); 
-cond_syscall(sys_connect); -cond_syscall(sys_getsockname); -cond_syscall(sys_getpeername); -cond_syscall(sys_setsockopt); -cond_syscall(compat_sys_setsockopt); -cond_syscall(sys_getsockopt); -cond_syscall(compat_sys_getsockopt); -cond_syscall(sys_sendto); -cond_syscall(sys_shutdown); -cond_syscall(sys_recvfrom); -cond_syscall(compat_sys_recvfrom); -cond_syscall(sys_sendmsg); -cond_syscall(compat_sys_sendmsg); -cond_syscall(sys_recvmsg); -cond_syscall(compat_sys_recvmsg); +COND_SYSCALL(socket); +COND_SYSCALL(socketpair); +COND_SYSCALL(bind); +COND_SYSCALL(listen); +COND_SYSCALL(accept); +COND_SYSCALL(connect); +COND_SYSCALL(getsockname); +COND_SYSCALL(getpeername); +COND_SYSCALL(setsockopt); +COND_SYSCALL_COMPAT(setsockopt); +COND_SYSCALL(getsockopt); +COND_SYSCALL_COMPAT(getsockopt); +COND_SYSCALL(sendto); +COND_SYSCALL(shutdown); +COND_SYSCALL(recvfrom); +COND_SYSCALL_COMPAT(recvfrom); +COND_SYSCALL(sendmsg); +COND_SYSCALL_COMPAT(sendmsg); +COND_SYSCALL(recvmsg); +COND_SYSCALL_COMPAT(recvmsg); /* mm/filemap.c */ /* mm/nommu.c, also with MMU */ -cond_syscall(sys_mremap); +COND_SYSCALL(mremap); /* security/keys/keyctl.c */ -cond_syscall(sys_add_key); -cond_syscall(sys_request_key); -cond_syscall(sys_keyctl); -cond_syscall(compat_sys_keyctl); +COND_SYSCALL(add_key); +COND_SYSCALL(request_key); +COND_SYSCALL(keyctl); +COND_SYSCALL_COMPAT(keyctl); /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */ -cond_syscall(sys_fadvise64_64); +COND_SYSCALL(fadvise64_64); /* mm/, CONFIG_MMU only */ -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); -cond_syscall(sys_mprotect); -cond_syscall(sys_msync); -cond_syscall(sys_mlock); -cond_syscall(sys_munlock); -cond_syscall(sys_mlockall); -cond_syscall(sys_munlockall); -cond_syscall(sys_mincore); -cond_syscall(sys_madvise); -cond_syscall(sys_remap_file_pages); -cond_syscall(sys_mbind); -cond_syscall(compat_sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_set_mempolicy); -cond_syscall(sys_migrate_pages); -cond_syscall(compat_sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(compat_sys_move_pages); - -cond_syscall(sys_perf_event_open); -cond_syscall(sys_accept4); -cond_syscall(sys_recvmmsg); -cond_syscall(compat_sys_recvmmsg); +COND_SYSCALL(swapon); +COND_SYSCALL(swapoff); +COND_SYSCALL(mprotect); +COND_SYSCALL(msync); +COND_SYSCALL(mlock); +COND_SYSCALL(munlock); +COND_SYSCALL(mlockall); +COND_SYSCALL(munlockall); +COND_SYSCALL(mincore); +COND_SYSCALL(madvise); +COND_SYSCALL(remap_file_pages); +COND_SYSCALL(mbind); +COND_SYSCALL_COMPAT(mbind); +COND_SYSCALL(get_mempolicy); +COND_SYSCALL_COMPAT(get_mempolicy); +COND_SYSCALL(set_mempolicy); +COND_SYSCALL_COMPAT(set_mempolicy); +COND_SYSCALL(migrate_pages); +COND_SYSCALL_COMPAT(migrate_pages); +COND_SYSCALL(move_pages); +COND_SYSCALL_COMPAT(move_pages); + +COND_SYSCALL(perf_event_open); +COND_SYSCALL(accept4); +COND_SYSCALL(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg); /* * Architecture specific syscalls: see further below */ /* fanotify */ -cond_syscall(sys_fanotify_init); -cond_syscall(sys_fanotify_mark); +COND_SYSCALL(fanotify_init); +COND_SYSCALL(fanotify_mark); /* open by handle */ -cond_syscall(sys_name_to_handle_at); -cond_syscall(sys_open_by_handle_at); -cond_syscall(compat_sys_open_by_handle_at); +COND_SYSCALL(name_to_handle_at); +COND_SYSCALL(open_by_handle_at); +COND_SYSCALL_COMPAT(open_by_handle_at); -cond_syscall(sys_sendmmsg); -cond_syscall(compat_sys_sendmmsg); 
-cond_syscall(sys_process_vm_readv); -cond_syscall(compat_sys_process_vm_readv); -cond_syscall(sys_process_vm_writev); -cond_syscall(compat_sys_process_vm_writev); +COND_SYSCALL(sendmmsg); +COND_SYSCALL_COMPAT(sendmmsg); +COND_SYSCALL(process_vm_readv); +COND_SYSCALL_COMPAT(process_vm_readv); +COND_SYSCALL(process_vm_writev); +COND_SYSCALL_COMPAT(process_vm_writev); /* compare kernel pointers */ -cond_syscall(sys_kcmp); +COND_SYSCALL(kcmp); -cond_syscall(sys_finit_module); +COND_SYSCALL(finit_module); /* operate on Secure Computing state */ -cond_syscall(sys_seccomp); +COND_SYSCALL(seccomp); -cond_syscall(sys_memfd_create); +COND_SYSCALL(memfd_create); /* access BPF programs and maps */ -cond_syscall(sys_bpf); +COND_SYSCALL(bpf); /* execveat */ -cond_syscall(sys_execveat); +COND_SYSCALL(execveat); -cond_syscall(sys_userfaultfd); +COND_SYSCALL(userfaultfd); /* membarrier */ -cond_syscall(sys_membarrier); +COND_SYSCALL(membarrier); -cond_syscall(sys_mlock2); +COND_SYSCALL(mlock2); -cond_syscall(sys_copy_file_range); +COND_SYSCALL(copy_file_range); /* memory protection keys */ -cond_syscall(sys_pkey_mprotect); -cond_syscall(sys_pkey_alloc); -cond_syscall(sys_pkey_free); +COND_SYSCALL(pkey_mprotect); +COND_SYSCALL(pkey_alloc); +COND_SYSCALL(pkey_free); /* @@ -327,35 +330,35 @@ cond_syscall(sys_pkey_free); */ /* pciconfig: alpha, arm, arm64, ia64, sparc */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); +COND_SYSCALL(pciconfig_read); +COND_SYSCALL(pciconfig_write); +COND_SYSCALL(pciconfig_iobase); /* sys_socketcall: arm, mips, x86, ... */ -cond_syscall(sys_socketcall); -cond_syscall(compat_sys_socketcall); +COND_SYSCALL(socketcall); +COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... 
*/ -cond_syscall(compat_sys_sysctl); -cond_syscall(compat_sys_fanotify_mark); +COND_SYSCALL_COMPAT(sysctl); +COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ -cond_syscall(sys_vm86old); -cond_syscall(sys_modify_ldt); -cond_syscall(compat_sys_quotactl32); -cond_syscall(sys_vm86); -cond_syscall(sys_kexec_file_load); +COND_SYSCALL(vm86old); +COND_SYSCALL(modify_ldt); +COND_SYSCALL_COMPAT(quotactl32); +COND_SYSCALL(vm86); +COND_SYSCALL(kexec_file_load); /* s390 */ -cond_syscall(sys_s390_pci_mmio_read); -cond_syscall(sys_s390_pci_mmio_write); -cond_syscall(compat_sys_s390_ipc); +COND_SYSCALL(s390_pci_mmio_read); +COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL_COMPAT(s390_ipc); /* powerpc */ cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); +COND_SYSCALL(spu_run); +COND_SYSCALL(spu_create); +COND_SYSCALL(subpage_prot); /* @@ -364,22 +367,22 @@ cond_syscall(sys_subpage_prot); */ /* __ARCH_WANT_SYSCALL_NO_FLAGS */ -cond_syscall(sys_epoll_create); -cond_syscall(sys_inotify_init); -cond_syscall(sys_eventfd); -cond_syscall(sys_signalfd); -cond_syscall(compat_sys_signalfd); +COND_SYSCALL(epoll_create); +COND_SYSCALL(inotify_init); +COND_SYSCALL(eventfd); +COND_SYSCALL(signalfd); +COND_SYSCALL_COMPAT(signalfd); /* __ARCH_WANT_SYSCALL_OFF_T */ -cond_syscall(sys_fadvise64); +COND_SYSCALL(fadvise64); /* __ARCH_WANT_SYSCALL_DEPRECATED */ -cond_syscall(sys_epoll_wait); -cond_syscall(sys_recv); -cond_syscall(compat_sys_recv); -cond_syscall(sys_send); -cond_syscall(sys_bdflush); -cond_syscall(sys_uselib); +COND_SYSCALL(epoll_wait); +COND_SYSCALL(recv); +COND_SYSCALL_COMPAT(recv); +COND_SYSCALL(send); +COND_SYSCALL(bdflush); +COND_SYSCALL(uselib); /* @@ -387,33 +390,33 @@ cond_syscall(sys_uselib); */ /* obsolete: SGETMASK_SYSCALL */ -cond_syscall(sys_sgetmask); -cond_syscall(sys_ssetmask); +COND_SYSCALL(sgetmask); +COND_SYSCALL(ssetmask); /* obsolete: SYSFS_SYSCALL */ -cond_syscall(sys_sysfs); +COND_SYSCALL(sysfs); /* obsolete: __ARCH_WANT_SYS_IPC */ -cond_syscall(sys_ipc); -cond_syscall(compat_sys_ipc); +COND_SYSCALL(ipc); +COND_SYSCALL_COMPAT(ipc); /* obsolete: UID16 */ -cond_syscall(sys_chown16); -cond_syscall(sys_fchown16); -cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); -cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); +COND_SYSCALL(chown16); +COND_SYSCALL(fchown16); +COND_SYSCALL(getegid16); +COND_SYSCALL(geteuid16); +COND_SYSCALL(getgid16); +COND_SYSCALL(getgroups16); +COND_SYSCALL(getresgid16); +COND_SYSCALL(getresuid16); +COND_SYSCALL(getuid16); +COND_SYSCALL(lchown16); +COND_SYSCALL(setfsgid16); +COND_SYSCALL(setfsuid16); +COND_SYSCALL(setgid16); +COND_SYSCALL(setgroups16); +COND_SYSCALL(setregid16); +COND_SYSCALL(setresgid16); +COND_SYSCALL(setresuid16); +COND_SYSCALL(setreuid16); +COND_SYSCALL(setuid16); -- cgit v1.2.3-71-gd317 From 820ed3fb2e6e986144465082d041e6a403a94135 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:46 -0700 Subject: bpf: sockmap, free memory on sock close with cork data If a socket with pending cork data is closed we do not return the memory to the socket until the garbage 
collector frees the psock structure. The garbage collector, though, can run after the sock has completed its close operation. If this ordering happens the sock code will trigger a WARN_ON because there is still outstanding memory accounted to the sock. To resolve this, ensure we return memory to the sock when a socket is closed. Signed-off-by: John Fastabend Fixes: 91843d540a13 ("bpf: sockmap, add msg_cork_bytes() helper") Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d2bda5aa25d7..8ddf326b3ade 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -211,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); free_start_sg(psock->sock, md); -- cgit v1.2.3-71-gd317 From 0e94d87fcfc81883b72574a5cc4638bd87adbb10 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:52 -0700 Subject: bpf: sockmap, duplicate release calls may NULL sk_prot It is possible to have multiple ULP tcp_release call paths in flight if a sock is closed and simultaneously being removed from the sockmap control path. The result would be setting the sk_prot to the saved values on the first iteration and then on the second iteration setting the value to NULL. This patch resolves this by ensuring we only reset the sk_prot pointer if we have a valid saved state to set. Fixes: 4f738adba30a7 ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8ddf326b3ade..8dd9210d7db7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,8 +182,10 @@ static void bpf_tcp_release(struct sock *sk) psock->cork = NULL; } - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } out: rcu_read_unlock(); } -- cgit v1.2.3-71-gd317 From 33491588c1fb2c76ed114a211ad0ee76c16b5a0c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 3 Apr 2018 14:09:47 +0200 Subject: kernel/bpf/syscall: fix 'defined but not used' warning There will be a build warning -Wunused-function if CONFIG_CGROUP_BPF isn't defined, since the only user is inside #ifdef CONFIG_CGROUP_BPF: kernel/bpf/syscall.c:1229:12: warning: ‘bpf_prog_attach_check_attach_type’ defined but not used [-Wunused-function] static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This patch moves bpf_prog_attach_check_attach_type() inside the #ifdef CONFIG_CGROUP_BPF block.
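To make the warning pattern concrete, a minimal standalone sketch (HAVE_FEATURE and every name below are hypothetical stand-ins, not code from the kernel tree): a static function whose only caller is compiled out trips -Wunused-function, and moving its definition into the same #ifdef block, as this patch does, silences it:

/*
 * unused-warn.c: compile with
 *   gcc -Wunused-function -c unused-warn.c                  (warns)
 *   gcc -Wunused-function -DHAVE_FEATURE -c unused-warn.c   (silent)
 */
static int check_type(int type, int expected)
{
        return type == expected ? 0 : -1;
}

#ifdef HAVE_FEATURE
int do_attach(int type, int expected)
{
        return check_type(type, expected);      /* the only caller */
}
#endif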
Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0244973ee544..4ca46df19c9a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1226,18 +1226,6 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - default: - return 0; - } -} - /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD expected_attach_type @@ -1465,6 +1453,18 @@ out_free_tp: #ifdef CONFIG_CGROUP_BPF +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int sockmap_get_from_fd(const union bpf_attr *attr, -- cgit v1.2.3-71-gd317 From d6f73825dcd0fa1de9a6bf37c79f6109cc87b82f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 3 Apr 2018 21:31:30 -0700 Subject: genirq: Make GENERIC_IRQ_MULTI_HANDLER depend on !MULTI_IRQ_HANDLER These config switches enable the same code in the core and in the not yet converted architecture code. Both can be selected by randconfig builds, causing linker errors because the same symbols are defined twice. Make the new GENERIC_IRQ_MULTI_HANDLER depend on !MULTI_IRQ_HANDLER to prevent that. The dependency will be removed once all architectures are converted over. Signed-off-by: Palmer Dabbelt Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Arnd Bergmann Link: https://lkml.kernel.org/r/20180404043130.31277-4-palmer@sifive.com --- kernel/irq/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..c6766f326072 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -134,6 +134,7 @@ config GENERIC_IRQ_DEBUGFS endmenu config GENERIC_IRQ_MULTI_HANDLER + depends on !MULTI_IRQ_HANDLER bool help Allow to specify the low level IRQ handler at run time. -- cgit v1.2.3-71-gd317 From d29a20645d5e929aa7e8616f28e5d8e1c49263ec Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 2 Apr 2018 09:49:54 -0700 Subject: sched/rt: Fix rq->clock_update_flags < RQCF_ACT_SKIP warning While running rt-tests' pi_stress program I got the following splat: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 27 PID: 0 at kernel/sched/sched.h:960 assert_clock_updated.isra.38.part.39+0x13/0x20 [...] enqueue_top_rt_rq+0xf4/0x150 ? cpufreq_dbs_governor_start+0x170/0x170 sched_rt_rq_enqueue+0x65/0x80 sched_rt_period_timer+0x156/0x360 ? sched_rt_rq_enqueue+0x80/0x80 __hrtimer_run_queues+0xfa/0x260 hrtimer_interrupt+0xcb/0x220 smp_apic_timer_interrupt+0x62/0x120 apic_timer_interrupt+0xf/0x20 [...]
do_idle+0x183/0x1e0 cpu_startup_entry+0x5f/0x70 start_secondary+0x192/0x1d0 secondary_startup_64+0xa5/0xb0 We can get rid of it by the "traditional" means of adding an update_rq_clock() call after acquiring the rq->lock in do_sched_rt_period_timer(). The case for the RT task throttling (which this workload also hits) can be ignored in that the skip_update call is actually bogus and quite the contrary (the request bits are removed/reverted). By setting RQCF_UPDATED we really don't care if the skip is happening or not and will therefore make the assert_clock_updated() check happy. Signed-off-by: Davidlohr Bueso Reviewed-by: Matt Fleming Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: linux-kernel@vger.kernel.org Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180402164954.16255-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 86b77987435e..ad13e6242481 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime; -- cgit v1.2.3-71-gd317 From adcc8da8859bee9548bb6d323b1e8de8a7252acd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Apr 2018 09:15:39 -0700 Subject: sched/core: Simplify helpers for rq clock update skip requests By renaming the functions we can get rid of the skip parameter and have better code readability. It makes zero sense to have things such as: rq_clock_skip_update(rq, false) when the skip request is in fact not going to happen. Ever. Rename things such that we end up with: rq_clock_skip_update(rq) rq_clock_cancel_skipupdate(rq) Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: matt@codeblueprint.co.uk Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180404161539.nhadkff2aats74jh@linux-n805 Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 17 ++++++++++++----- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..550a07f648b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -874,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d1c7bf7c7e5b..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost.
*/ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0951d1c58d2f..54dc31e7ab9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad13e6242481..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -861,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3deaee7a7a2..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags { -- cgit v1.2.3-71-gd317 From 7303e30ec1d8fb5ca1f07c92d069241c32b2ee1b Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:03 +0200 Subject: syscalls/core: Prepare CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y for compat syscalls It may be useful for an architecture to override the definitions of the COMPAT_SYSCALL_DEFINE0() and __COMPAT_SYSCALL_DEFINEx() macros in <linux/compat.h>, in particular to use a different calling convention for syscalls. This patch provides a mechanism to do so, based on the previously introduced CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled, <asm/syscall_wrapper.h> is included in <linux/compat.h> and may be used to define the macros mentioned above. Moreover, as the syscall calling convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set, the compat syscall function prototypes in <linux/compat.h> are #ifndef'd out in that case. As some of the syscalls and/or compat syscalls may not be present, the COND_SYSCALL() and COND_SYSCALL_COMPAT() macros in kernel/sys_ni.c as well as the SYS_NI() and COMPAT_SYS_NI() macros in kernel/time/posix-stubs.c can be re-defined in <asm/syscall_wrapper.h> iff CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-5-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/compat.h | 22 ++++++++++++++++++++++ init/Kconfig | 9 ++++++--- kernel/sys_ni.c | 10 ++++++++++ kernel/time/posix-stubs.c | 10 ++++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 9847c5a013c3..2d85ec5cfda2 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -24,6 +24,17 @@ #include #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* + * It may be useful for an architecture to override the definitions of the + * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular + * to use a different calling convention for syscalls. To allow for that, + * the prototypes for the compat_sys_*() functions below will *not* be included + * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. + */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + #ifndef COMPAT_USE_64BIT_TIME #define COMPAT_USE_64BIT_TIME 0 #endif @@ -32,10 +43,12 @@ #define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v)) #endif +#ifndef COMPAT_SYSCALL_DEFINE0 #define COMPAT_SYSCALL_DEFINE0(name) \ asmlinkage long compat_sys_##name(void); \ ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \ asmlinkage long compat_sys_##name(void) +#endif /* COMPAT_SYSCALL_DEFINE0 */ #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) @@ -50,6 +63,7 @@ #define COMPAT_SYSCALL_DEFINE6(name, ...) \ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) +#ifndef COMPAT_SYSCALL_DEFINEx #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\ @@ -62,6 +76,7 @@ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ } \ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#endif /* COMPAT_SYSCALL_DEFINEx */ #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() @@ -517,7 +532,12 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long); * Please note that these prototypes here are only provided for information * purposes, for static analysis, and for linking from the syscall table. * These functions should not be called elsewhere from kernel code. + * + * As the syscall calling convention may be different from the default + * for architectures overriding the syscall calling convention, do not + * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. + */ +#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); @@ -955,6 +975,8 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr); /* obsolete: net/socket.c */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* * For most but not all architectures, "am I in a compat syscall?"
and diff --git a/init/Kconfig b/init/Kconfig index 068eb6c3bbf7..2dbc88051bde 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1925,8 +1925,11 @@ config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool # It may be useful for an architecture to override the definitions of the -# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in , -# in particular to use a different calling convention for syscalls. +# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h> +# and the COMPAT_ variants in <linux/compat.h>, in particular to use a +# different calling convention for syscalls. They can also override the +# macros for not-implemented syscalls in kernel/sys_ni.c and +# kernel/time/posix-stubs.c. All these overrides need to be available in +# <asm/syscall_wrapper.h>. config ARCH_HAS_SYSCALL_WRAPPER def_bool n - depends on !COMPAT diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6cafc008f6db..9791364925dc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -5,6 +5,11 @@ #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* we can't #include <linux/syscalls.h> here, but tell gcc to not warn with -Wmissing-prototypes */ asmlinkage long sys_ni_syscall(void); @@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } +#ifndef COND_SYSCALL #define COND_SYSCALL(name) cond_syscall(sys_##name) +#endif /* COND_SYSCALL */ + +#ifndef COND_SYSCALL_COMPAT #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) +#endif /* COND_SYSCALL_COMPAT */ /* * This list is kept in the same order as include/uapi/asm-generic/unistd.h. diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..69a937c3cd81 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -19,6 +19,11 @@ #include #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + asmlinkage long sys_ni_posix_timers(void) { pr_err_once("process %d (%s) attempted a POSIX timer syscall " @@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void) return -ENOSYS; } +#ifndef SYS_NI #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif SYS_NI(timer_create); SYS_NI(timer_gettime); -- cgit v1.2.3-71-gd317 From 0e7767687fdabfc58d5046e7488632bf2ecd4d0c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Apr 2018 18:58:27 +0200 Subject: time: tick-sched: Reorganize idle tick management code Prepare the scheduler tick code for reworking the idle loop to avoid stopping the tick in some cases. The idea is to split the nohz idle entry call to decouple the idle time stats accounting and preparatory work from the actual tick stop code, in order to later be able to delay the tick stop once we reach more power-knowledgeable callers. Move the tick_nohz_start_idle() invocation out of __tick_nohz_idle_enter(), rename the latter to __tick_nohz_idle_stop_tick() and define tick_nohz_idle_stop_tick() as a wrapper around it for calling it from the outside. Make tick_nohz_idle_enter() only call tick_nohz_start_idle() instead of calling the entire __tick_nohz_idle_enter(), add another wrapper disabling and enabling interrupts around tick_nohz_idle_stop_tick() and make the current callers of tick_nohz_idle_enter() call it too to retain their current functionality. Signed-off-by: Rafael J.
Wysocki Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- arch/x86/xen/smp_pv.c | 1 + include/linux/tick.h | 12 ++++++++++++ kernel/sched/idle.c | 1 + kernel/time/tick-sched.c | 46 +++++++++++++++++++++++++--------------------- 4 files changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index c0c756c76afe..2e20ae2fa2d6 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ * data back is to call: */ tick_nohz_idle_enter(); + tick_nohz_idle_stop_tick_protected(); cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); } diff --git a/include/linux/tick.h b/include/linux/tick.h index 7f8c9a127f5a..1d253df9ea3c 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -115,6 +115,7 @@ enum tick_dep_bits { extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped_cpu(int cpu); +extern void tick_nohz_idle_stop_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); @@ -123,10 +124,19 @@ extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); + +static inline void tick_nohz_idle_stop_tick_protected(void) +{ + local_irq_disable(); + tick_nohz_idle_stop_tick(); + local_irq_enable(); +} + #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } +static inline void tick_nohz_idle_stop_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } @@ -136,6 +146,8 @@ static inline ktime_t tick_nohz_get_sleep_length(void) } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } + +static inline void tick_nohz_idle_stop_tick_protected(void) { } #endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2975f195e1c4..c0bc111878e6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -216,6 +216,7 @@ static void do_idle(void) __current_set_polling(); tick_nohz_idle_enter(); + tick_nohz_idle_stop_tick_protected(); while (!need_resched()) { check_pgt_cache(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d4a0342f934..678349aec483 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -528,14 +528,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -894,19 +891,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if 
(can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; ts->idle_calls++; - expires = tick_nohz_stop_sched_tick(ts, now, cpu); + /* + * The idle entry time should be a sufficient approximation of + * the current time at this point. + */ + expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu); if (expires > 0LL) { ts->idle_sleeps++; ts->idle_expires = expires; @@ -920,16 +919,19 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { @@ -941,7 +943,7 @@ void tick_nohz_idle_enter(void) ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -958,10 +960,12 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle) - __tick_nohz_idle_enter(ts); - else + if (ts->inidle) { + tick_nohz_start_idle(ts); + __tick_nohz_idle_stop_tick(ts); + } else { tick_nohz_full_update_tick(ts); + } } /** -- cgit v1.2.3-71-gd317 From 2aaf709a518d26563b80fd7a42379d7aa7ffed4a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Mar 2018 23:05:50 +0100 Subject: sched: idle: Do not stop the tick upfront in the idle loop Push the decision whether or not to stop the tick somewhat deeper into the idle loop. Stopping the tick upfront leads to unpleasant outcomes in case the idle governor doesn't agree with the nohz code on the duration of the upcoming idle period. Specifically, if the tick has been stopped and the idle governor predicts short idle, the situation is bad regardless of whether or not the prediction is accurate. If it is accurate, the tick has been stopped unnecessarily which means excessive overhead. If it is not accurate, the CPU is likely to spend too much time in the (shallow, because short idle has been predicted) idle state selected by the governor [1]. As the first step towards addressing this problem, change the code to make the tick stopping decision inside of the loop in do_idle(). In particular, do not stop the tick in the cpu_idle_poll() code path. Also don't do that in tick_nohz_irq_exit() which doesn't really have enough information on whether or not to stop the tick. Link: https://marc.info/?l=linux-pm&m=150116085925208&w=2 # [1] Link: https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- include/linux/tick.h | 2 ++ kernel/sched/idle.c | 9 ++++++--- kernel/time/tick-sched.c | 26 ++++++++++++++++++-------- 3 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 1d253df9ea3c..fccebfba167e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -116,6 +116,7 @@ extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped_cpu(int cpu); extern void tick_nohz_idle_stop_tick(void); +extern void tick_nohz_idle_restart_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); @@ -137,6 +138,7 @@ static inline void tick_nohz_idle_stop_tick_protected(void) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } static inline void tick_nohz_idle_stop_tick(void) { } +static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c0bc111878e6..3777e83c0b5a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -216,13 +216,13 @@ static void do_idle(void) __current_set_polling(); tick_nohz_idle_enter(); - tick_nohz_idle_stop_tick_protected(); while (!need_resched()) { check_pgt_cache(); rmb(); if (cpu_is_offline(cpu)) { + tick_nohz_idle_stop_tick_protected(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -236,10 +236,13 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + tick_nohz_idle_restart_tick(); cpu_idle_poll(); - else + } else { + tick_nohz_idle_stop_tick(); cpuidle_idle_call(); + } arch_cpu_idle_exit(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 678349aec483..f5d37788ea85 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -960,12 +960,10 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle) { + if (ts->inidle) tick_nohz_start_idle(ts); - __tick_nohz_idle_stop_tick(ts); - } else { + else tick_nohz_full_update_tick(ts); - } } /** @@ -1026,6 +1024,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1050,10 +1062,8 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } -- cgit v1.2.3-71-gd317 From ed98c34919985a9f87c3edacb9a8d8c283c9e243 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 15 Mar 2018 23:07:41 +0100 Subject: sched: idle: Do not stop the tick before cpuidle_idle_call() Make cpuidle_idle_call() decide whether or not to stop the tick. First, the cpuidle_enter_s2idle() path deals with the tick (and with the entire timekeeping for that matter) by itself and it doesn't need the tick to be stopped beforehand. Second, to address the issue with short idle duration predictions by the idle governor after the tick has been stopped, it will be necessary to change the ordering of cpuidle_select() with respect to tick_nohz_idle_stop_tick(). To prepare for that, put a tick_nohz_idle_stop_tick() call in the same branch in which cpuidle_select() is called. Signed-off-by: Rafael J. Wysocki Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- kernel/sched/idle.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3777e83c0b5a..4f64835d38a8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -141,13 +141,15 @@ static void cpuidle_idle_call(void) } /* - * Tell the RCU framework we are entering an idle section, - * so no more rcu read side critical sections and one more + * The RCU framework needs to be told that we are entering an idle + * section, so no more rcu read side critical sections and one more * step to the grace period */ - rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + default_idle_call(); goto exit_idle; } @@ -164,16 +166,26 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { + rcu_idle_enter(); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; } + + rcu_idle_exit(); } + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + /* * Ask the cpuidle framework to choose a convenient idle state. */ @@ -240,7 +252,6 @@ static void do_idle(void) tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { - tick_nohz_idle_stop_tick(); cpuidle_idle_call(); } arch_cpu_idle_exit(); -- cgit v1.2.3-71-gd317 From 3eda69c92d4751977baf2d34e88a29d4b6affa7d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 5 Apr 2018 16:25:12 -0700 Subject: kernel/fork.c: detect early free of a live mm KASAN splats indicate that in some cases we free a live mm, then continue to access it, with potentially disastrous results. This is likely due to a mismatched mmdrop() somewhere in the kernel, but so far the culprit remains elusive. Let's have __mmdrop() verify that the mm isn't live for the current task, similar to the existing check for init_mm. This way, we can catch this class of issue earlier, and without requiring KASAN. Currently, idle_task_exit() leaves active_mm stale after it switches to init_mm. This isn't harmful, but will trigger the new assertions, so we must adjust idle_task_exit() to update active_mm. 
Link: http://lkml.kernel.org/r/20180312140103.19235-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Andrew Morton Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ kernel/sched/core.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f71b67dc156d..242c8c93d285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -595,6 +595,8 @@ static void check_mm(struct mm_struct *mm) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + WARN_ON_ONCE(mm == current->mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); hmm_mm_destroy(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..e8afd6086f23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5560,6 +5560,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; finish_arch_post_lock_switch(); } mmdrop(mm); -- cgit v1.2.3-71-gd317 From 514c60324960137e74457fdc233a339b985fa8a8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Apr 2018 16:25:34 -0700 Subject: headers: untangle kmemleak.h from mm.h Currently <linux/slab.h> #includes <linux/kmemleak.h> for no obvious reason. It looks like it's only a convenience, so remove kmemleak.h from slab.h and add <linux/kmemleak.h> to any users of kmemleak_* that don't already #include it. Also remove <linux/kmemleak.h> from source files that do not use it. This is tested on i386 allmodconfig and x86_64 allmodconfig. It would be good to run it through the 0day bot for other $ARCHes. I have neither the horsepower nor the storage space for the other $ARCHes. Update: This patch has been extensively build-tested by both the 0day bot & kisskb/ozlabs build farms. Both of them reported 2 build failures for which patches are included here (in v2). [ slab.h is the second most used header file after module.h; kernel.h is right there with slab.h. There could be some minor error in the counting due to some #includes having comments after them and I didn't combine all of those. ] [akpm@linux-foundation.org: security/keys/big_key.c needs vmalloc.h, per sfr] Link: http://lkml.kernel.org/r/e4309f98-3749-93e1-4bb7-d9501a39d015@infradead.org Link: http://kisskb.ellerman.id.au/kisskb/head/13396/ Signed-off-by: Randy Dunlap Reviewed-by: Ingo Molnar Reported-by: Michael Ellerman [2 build failures] Reported-by: Fengguang Wu [2 build failures] Reviewed-by: Andrew Morton Cc: Wei Yongjun Cc: Luis R.
Rodriguez Cc: Greg Kroah-Hartman Cc: Mimi Zohar Cc: John Johansen Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/dart_iommu.c | 1 + arch/powerpc/sysdev/msi_bitmap.c | 1 + arch/s390/kernel/nmi.c | 2 +- arch/s390/kernel/smp.c | 1 - arch/sparc/kernel/irq_64.c | 1 - arch/x86/kernel/pci-dma.c | 1 - drivers/iommu/exynos-iommu.c | 1 + drivers/iommu/mtk_iommu_v1.c | 1 - drivers/net/ethernet/ti/cpsw.c | 1 + drivers/net/wireless/realtek/rtlwifi/pci.c | 1 - drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c | 1 - drivers/staging/rtl8188eu/hal/fw.c | 2 +- drivers/staging/rtlwifi/pci.c | 1 - drivers/virtio/virtio_ring.c | 1 - include/linux/slab.h | 1 - kernel/ucount.c | 1 + lib/test_firmware.c | 1 + mm/cma.c | 1 + mm/memblock.c | 1 + net/core/sysctl_net_core.c | 1 - net/ipv4/route.c | 1 - security/apparmor/lsm.c | 1 - security/keys/big_key.c | 1 + 23 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index a6198d4f0f03..5ca3e22d0512 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index c4dae27172b3..6243a7e537d0 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index c7a627620e5e..8c867b43c8eb 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a4a9fe1934e9..2f8f7d7dd9a8 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index d66dde833f5e..713670e6d13d 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 14437116ffea..77625b60a510 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 2138102ef611..c5f4f7691b57 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 542930cd183d..5a96fd14ac22 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 1b4af54a4968..30371274409d 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -35,6 +35,7 @@ #include #include #include +#include #include diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 2437422625bf..57bb8f049e59 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -31,7 
+31,6 @@ #include "efuse.h" #include #include -#include #include MODULE_AUTHOR("lizhaoming "); diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c index 015476e3f7e5..f3bff66e85d0 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c @@ -32,7 +32,6 @@ #include "../rtl8192ce/def.h" #include "fw_common.h" #include -#include static void _rtl92c_enable_fw_download(struct ieee80211_hw *hw, bool enable) { diff --git a/drivers/staging/rtl8188eu/hal/fw.c b/drivers/staging/rtl8188eu/hal/fw.c index 03d091bad13a..6b67b38a6a9f 100644 --- a/drivers/staging/rtl8188eu/hal/fw.c +++ b/drivers/staging/rtl8188eu/hal/fw.c @@ -30,7 +30,7 @@ #include "rtl8188e_hal.h" #include -#include +#include static void _rtl88e_enable_fw_download(struct adapter *adapt, bool enable) { diff --git a/drivers/staging/rtlwifi/pci.c b/drivers/staging/rtlwifi/pci.c index 70a64a5f564a..d56810eabde7 100644 --- a/drivers/staging/rtlwifi/pci.c +++ b/drivers/staging/rtlwifi/pci.c @@ -31,7 +31,6 @@ #include "efuse.h" #include #include -#include #include MODULE_AUTHOR("lizhaoming "); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71458f493cf8..21d464a29cf8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/slab.h b/include/linux/slab.h index 04402c637171..81ebd71f8c03 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -125,7 +125,6 @@ #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) -#include #include struct mem_cgroup; diff --git a/kernel/ucount.c b/kernel/ucount.c index b4eeee03934f..f48d1b6376a4 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #define UCOUNTS_HASHTABLE_BITS 10 diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 078a61480573..cee000ac54d8 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -21,6 +21,7 @@ #include #include #include +#include #define TEST_FIRMWARE_NAME "test-firmware.bin" #define TEST_FIRMWARE_NUM_REQS 4 diff --git a/mm/cma.c b/mm/cma.c index 0600fc08a9f4..5809bbe360d7 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "cma.h" diff --git a/mm/memblock.c b/mm/memblock.c index 04406a930bfc..ecc6cb58cd33 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b3b609f0eeb5..b1a2c5e38530 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8322e479f299..594a1c605c92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -108,7 +108,6 @@ #include #ifdef CONFIG_SYSCTL #include -#include #endif #include #include diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 9a65eeaf7dfa..6134302c143c 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "include/apparmor.h" diff --git a/security/keys/big_key.c b/security/keys/big_key.c index fa728f662a6f..933623784ccd 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ 
-18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-71-gd317 From efefc97736e6f3261879bc9dddcb161224a455f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 Mar 2018 10:11:28 +0100 Subject: jiffies: Introduce USER_TICK_USEC and redefine TICK_USEC Since the subsequent changes will need a TICK_USEC definition analogous to TICK_NSEC, rename the existing TICK_USEC as USER_TICK_USEC, update its users and redefine TICK_USEC accordingly. Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- drivers/net/ethernet/sfc/mcdi.c | 2 +- include/linux/jiffies.h | 7 +++++-- kernel/time/ntp.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c index 9c2567b0d93e..dfad93fca0a6 100644 --- a/drivers/net/ethernet/sfc/mcdi.c +++ b/drivers/net/ethernet/sfc/mcdi.c @@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) * because generally mcdi responses are fast. After that, back off * and poll once a jiffy (approximately) */ - spins = TICK_USEC; + spins = USER_TICK_USEC; finish = jiffies + MCDI_RPC_TIMEOUT; while (1) { diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 9385aa57497b..a27cf6652327 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -62,8 +62,11 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) -/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ -#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) +/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) + +/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ +#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; -- cgit v1.2.3-71-gd317 From 45f1ff59e27ca59d33cc1a317e669d90022ccf7d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Mar 2018 17:50:49 +0100 Subject: cpuidle: Return nohz hint from cpuidle_select() Add a new pointer argument to cpuidle_select() and to the ->select cpuidle governor callback to allow a boolean value indicating whether or not the tick should be stopped before entering the selected state to be returned from there. Make the ladder governor ignore that pointer (to preserve its current behavior) and make the menu governor return 'false" through it if: (1) the idle exit latency is constrained at 0, or (2) the selected state is a polling one, or (3) the expected idle period duration is within the tick period range. In addition to that, the correction factor computations in the menu governor need to take the possibility that the tick may not be stopped into account to avoid artificially small correction factor values. To that end, add a mechanism to record tick wakeups, as suggested by Peter Zijlstra, and use it to modify the menu_update() behavior when tick wakeup occurs. 
Namely, if the CPU is woken up by the tick and the return value of tick_nohz_get_sleep_length() is not within the tick boundary, the predicted idle duration is likely too short, so make menu_update() try to compensate for that by updating the governor statistics as though the CPU was idle for a long time. Since the value returned through the new argument pointer of cpuidle_select() is not used by its caller yet, this change by itself is not expected to alter the functionality of the code. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 10 +++++-- drivers/cpuidle/governors/ladder.c | 3 +- drivers/cpuidle/governors/menu.c | 59 ++++++++++++++++++++++++++++++-------- include/linux/cpuidle.h | 8 ++++-- include/linux/tick.h | 2 ++ kernel/sched/idle.c | 4 ++- kernel/time/tick-sched.c | 20 +++++++++++++ 7 files changed, 87 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0003e9a02637..6df894d65d9e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * * @drv: the cpuidle driver * @dev: the cpuidle device + * @stop_tick: indication on whether or not to stop the tick * * Returns the index of the idle state. The return value must not be negative. + * + * The memory location pointed to by @stop_tick is expected to be written the + * 'false' boolean value if the scheduler tick should not be stopped before + * entering the returned state. */ -int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) { - return cpuidle_curr_governor->select(drv, dev); + return cpuidle_curr_governor->select(drv, dev, stop_tick); } /** diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 1ad8745fd6d6..b24883f85c99 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -63,9 +63,10 @@ static inline void ladder_do_selection(struct ladder_device *ldev, * ladder_select_state - selects the next state to enter * @drv: cpuidle driver * @dev: the CPU + * @dummy: not used */ static int ladder_select_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, bool *dummy) { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct device *device = get_cpu_device(dev->cpu); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index aa390404e85f..f53a929bd2bd 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -123,6 +123,7 @@ struct menu_device { int last_state_idx; int needs_update; + int tick_wakeup; unsigned int next_timer_us; unsigned int predicted_us; @@ -279,8 +280,10 @@ again: * menu_select - selects the next idle state to enter * @drv: cpuidle driver containing state data * @dev: the CPU + * @stop_tick: indication on whether or not to stop the tick */ -static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) +static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) { struct menu_device *data = this_cpu_ptr(&menu_devices); struct device *device = get_cpu_device(dev->cpu); @@ -303,8 +306,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) latency_req = resume_latency; /* Special case when user has set 
very strict latency requirement */ - if (unlikely(latency_req == 0)) + if (unlikely(latency_req == 0)) { + *stop_tick = false; return 0; + } /* determine the expected residency time, round up */ data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); @@ -354,6 +359,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (latency_req > interactivity_req) latency_req = interactivity_req; + expected_interval = data->predicted_us; /* * Find the idle state with the lowest power while satisfying * our constraints. @@ -369,15 +375,30 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) idx = i; /* first enabled state */ if (s->target_residency > data->predicted_us) break; - if (s->exit_latency > latency_req) + if (s->exit_latency > latency_req) { + /* + * If we break out of the loop for latency reasons, use + * the target residency of the selected state as the + * expected idle duration so that the tick is retained + * as long as that target residency is low enough. + */ + expected_interval = drv->states[idx].target_residency; break; - + } idx = i; } if (idx == -1) idx = 0; /* No states enabled. Must use 0. */ + /* + * Don't stop the tick if the selected state is a polling one or if the + * expected idle duration is shorter than the tick period length. + */ + if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || + expected_interval < TICK_USEC) + *stop_tick = false; + data->last_state_idx = idx; return data->last_state_idx; @@ -397,6 +418,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) data->last_state_idx = index; data->needs_update = 1; + data->tick_wakeup = tick_nohz_idle_got_tick(); } /** @@ -427,14 +449,27 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * assume the state was never reached and the exit latency is 0. */ - /* measured value */ - measured_us = cpuidle_get_last_residency(dev); - - /* Deduct exit latency */ - if (measured_us > 2 * target->exit_latency) - measured_us -= target->exit_latency; - else - measured_us /= 2; + if (data->tick_wakeup && data->next_timer_us > TICK_USEC) { + /* + * The nohz code said that there wouldn't be any events within + * the tick boundary (if the tick was stopped), but the idle + * duration predictor had a differing opinion. Since the CPU + * was woken up by a tick (that wasn't stopped after all), the + * predictor was not quite right, so assume that the CPU could + * have been idle long (but not forever) to help the idle + * duration predictor do a better job next time. 
+ */ + measured_us = 9 * MAX_INTERESTING / 10; + } else { + /* measured value */ + measured_us = cpuidle_get_last_residency(dev); + + /* Deduct exit latency */ + if (measured_us > 2 * target->exit_latency) + measured_us -= target->exit_latency; + else + measured_us /= 2; + } /* Make sure our coefficients do not exceed unity */ if (measured_us > data->next_timer_us) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a806e94c482f..1eefabf1621f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + bool *stop_tick); extern int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index); extern void cpuidle_reflect(struct cpuidle_device *dev, int index); @@ -167,7 +168,7 @@ static inline bool cpuidle_not_available(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return true; } static inline int cpuidle_select(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, bool *stop_tick) {return -ENODEV; } static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) @@ -250,7 +251,8 @@ struct cpuidle_governor { struct cpuidle_device *dev); int (*select) (struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + bool *stop_tick); void (*reflect) (struct cpuidle_device *dev, int index); }; diff --git a/include/linux/tick.h b/include/linux/tick.h index fccebfba167e..ef0717e5e526 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -120,6 +120,7 @@ extern void tick_nohz_idle_restart_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); +extern bool tick_nohz_idle_got_tick(void); extern ktime_t tick_nohz_get_sleep_length(void); extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); @@ -141,6 +142,7 @@ static inline void tick_nohz_idle_stop_tick(void) { } static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } +static inline bool tick_nohz_idle_got_tick(void) { return false; } static inline ktime_t tick_nohz_get_sleep_length(void) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4f64835d38a8..a966bd2a6fa0 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -183,13 +183,15 @@ static void cpuidle_idle_call(void) next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + bool stop_tick = true; + tick_nohz_idle_stop_tick(); rcu_idle_enter(); /* * Ask the cpuidle framework to choose a convenient idle state. 
*/ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev, &stop_tick); entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f5d37788ea85..69fe113cfc7f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -966,6 +966,20 @@ void tick_nohz_irq_exit(void) tick_nohz_full_update_tick(ts); } +/** + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->inidle > 1) { + ts->inidle = 1; + return true; + } + return false; +} + /** * tick_nohz_get_sleep_length - return the length of the current sleep * @@ -1077,6 +1091,9 @@ static void tick_nohz_handler(struct clock_event_device *dev) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + if (ts->inidle) + ts->inidle = 2; + dev->next_event = KTIME_MAX; tick_sched_do_timer(now); @@ -1174,6 +1191,9 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + if (ts->inidle) + ts->inidle = 2; + tick_sched_do_timer(now); /* -- cgit v1.2.3-71-gd317 From 0211e12dd0a5385ecffd3557bc570dbad7fcf245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Apr 2018 12:40:07 +0200 Subject: genirq/affinity: Don't return with empty affinity masks on error When the allocation of node_to_possible_cpumask fails, then irq_create_affinity_masks() returns with a pointer to the empty affinity masks array, which will cause malfunction. Reorder the allocations so the masks array allocation comes last and every failure path returns NULL. 
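For illustration, a minimal sketch of the ordering rule this fix establishes (not patch code; names simplified, assuming <linux/cpumask.h> and <linux/slab.h>): allocate the caller-visible result last, so that every failure path can return NULL instead of a half-initialized masks array.

static struct cpumask *alloc_masks_sketch(int nvecs)
{
	cpumask_var_t nmsk;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;			/* nothing else allocated yet */
	/* The returned object is allocated last ... */
	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
	/* ... so a failure here still falls through to a NULL return. */
	free_cpumask_var(nmsk);
	return masks;
}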
Fixes: 9a0ef98e186d ("genirq/affinity: Assign vectors to all present CPUs") Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Ming Lei --- kernel/irq/affinity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..e0665549af59 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; + struct cpumask *masks = NULL; cpumask_var_t nmsk, *node_to_possible_cpumask; /* @@ -121,13 +121,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); if (!node_to_possible_cpumask) - goto out; + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) @@ -192,8 +192,9 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); +outnodemsk: free_node_to_possible_cpumask(node_to_possible_cpumask); -out: +outcpumsk: free_cpumask_var(nmsk); return masks; } -- cgit v1.2.3-71-gd317 From 47778f33dcba7feb92031643b37e477892f82b62 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:55 +0800 Subject: genirq/affinity: Rename *node_to_possible_cpumask as *node_to_cpumask The following patches will introduce two stage irq spreading for improving irq spread on all possible CPUs. No functional change. 
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e0665549af59..272c968d9ef1 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_possible_cpumask; + cpumask_var_t nmsk, *node_to_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -121,8 +121,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) goto outcpumsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, + build_node_to_cpumask(node_to_cpumask); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); + node_to_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = 
cpumask_weight(nmsk); @@ -193,7 +193,7 @@ done: for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); outnodemsk: - free_node_to_possible_cpumask(node_to_possible_cpumask); + free_node_to_cpumask(node_to_cpumask); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3-71-gd317 From b3e6aaa8d94d618e685c4df08bef991a4fb43923 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:56 +0800 Subject: genirq/affinity: Move actual irq vector spreading into a helper function No functional change, just prepare for converting to 2-stage irq vector spreading. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 97 +++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 272c968d9ef1..a9c36904500c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. - */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; + int curvec = affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_cpumask; + int n, nodes, cpus_per_vec, extra_vecs; - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. 
- */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto outcpumsk; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto outnodemsk; - - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (++curvec == last_affv) break; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { @@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) --nodes; } -done: +out: + return curvec - affd->pre_vectors; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + cpumask_var_t nmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + int curvec; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, + cpu_possible_mask, nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ -- cgit v1.2.3-71-gd317 From 1a2d0914e23aab386f5d5acb689777e24151c2c8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:57 +0800 Subject: genirq/affinity: Allow irq spreading from a given starting point To support two stage irq vector spreading, it's required to add a starting point to the spreading function. No functional change, just preparatory work for the actual two stage change. 
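To see why a starting point is needed, here is a sketch of the two-pass call shape the follow-up patch uses (npresmsk and the usedvecs bookkeeping are taken from that later diff; this is illustrative, not patch code): the second pass must resume at the vector where the first pass stopped.

	/* Pass 1: spread over present CPUs, starting after the pre-vectors. */
	usedvecs = irq_build_affinity_masks(affd, affd->pre_vectors, affvecs,
					    node_to_cpumask, cpu_present_mask,
					    nmsk, masks);
	/* Pass 2: spread over the remaining CPUs, resuming where pass 1 ended. */
	usedvecs += irq_build_affinity_masks(affd, affd->pre_vectors + usedvecs,
					     affvecs, node_to_cpumask, npresmsk,
					     nmsk, masks);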
[ tglx: Renamed variables, tidied up the code and massaged changelog ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a9c36904500c..213695a27ddb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; - int curvec = affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - int n, nodes, cpus_per_vec, extra_vecs; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -112,12 +112,13 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } goto out; } @@ -126,7 +127,7 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -150,13 +151,16 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } out: - return curvec - affd->pre_vectors; + return done; } /** @@ -169,9 +173,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { + int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; cpumask_var_t nmsk, *node_to_cpumask; struct cpumask *masks = NULL; - int curvec; /* * If there aren't any vectors left after applying the pre/post @@ -198,8 +202,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, - cpu_possible_mask, nmsk, masks); + curvec += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_possible_mask, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity 
*/ -- cgit v1.2.3-71-gd317 From d3056812e7dfe6bf4f8ad9e397a9116dd5d32d15 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:58 +0800 Subject: genirq/affinity: Spread irq vectors among present CPUs as far as possible Commit 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") tried to spread the interrupts across all possible CPUs to make sure that in case of physical hotplug (e.g. virtualization) the CPUs which get plugged in after the device was initialized are targeted by a hardware queue and the corresponding interrupt. This has a downside in cases where the ACPI tables claim that there are more possible CPUs than present CPUs and the number of interrupts to spread out is smaller than the number of possible CPUs. These bogus ACPI tables are unfortunately not uncommon. In such a case the vector spreading algorithm assigns interrupts to CPUs which can never be utilized, and as a consequence these interrupts are unused instead of being mapped to present CPUs. As a result the performance of the device is suboptimal. To fix this, spread the interrupt vectors in two stages: 1) Spread as many interrupts as possible among the present CPUs 2) Spread the remaining vectors among non present CPUs On an 8 core system, where CPU 0-3 are present and CPU 4-7 are not present, for a device with 4 queues the resulting interrupt affinity is:
1) Before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs"):
   irq 39, cpu list 0
   irq 40, cpu list 1
   irq 41, cpu list 2
   irq 42, cpu list 3
2) With 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs"):
   irq 39, cpu list 0-2
   irq 40, cpu list 3-4,6
   irq 41, cpu list 5
   irq 42, cpu list 7
3) With the refined vector spread applied:
   irq 39, cpu list 0,4
   irq 40, cpu list 1,6
   irq 41, cpu list 2,5
   irq 42, cpu list 3,7
On an 8 core system where all CPUs are present, the resulting interrupt affinity for the 4 queues is:
   irq 39, cpu list 0,1
   irq 40, cpu list 2,3
   irq 41, cpu list 4,5
   irq 42, cpu list 6,7
This is independent of the number of CPUs which are online at the point of initialization, because in such a system the offline CPUs can be easily onlined afterwards, while non-present CPUs need to be plugged physically or virtually, which requires external interaction. The downside of this approach is that in case of physical hotplug the interrupt vector spreading might be suboptimal when CPUs 4-7 are physically plugged: suboptimal from a NUMA point of view, and due to the single target nature of interrupt affinities the later plugged CPUs might not be targeted by interrupts at all. Though physical hotplug systems are not the common case, the broken ACPI table disease is widespread, so it's preferred to have as many interrupts as possible utilized at the point where the device is initialized. Block multi-queue devices like NVME create a hardware queue per possible CPU, so the goal of commit 84676c1f21 to assign one interrupt vector per possible CPU is still achieved even with physical/virtual hotplug.
[ tglx: Changed from online to present CPUs for the first spreading stage, renamed variables for readability sake, added comments and massaged changelog ] Reported-by: Laurence Oberman Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-5-ming.lei@redhat.com --- kernel/irq/affinity.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 213695a27ddb..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + if (!cpumask_weight(cpu_mask)) + return 0; + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* @@ -173,8 +176,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - cpumask_var_t nmsk, *node_to_cpumask; + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; struct cpumask *masks = NULL; /* @@ -187,9 +191,12 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outcpumsk; + goto outnpresmsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -202,16 +209,40 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_possible_mask, - nmsk, masks); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. 
+ */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + outnodemsk: free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3-71-gd317 From 3fd49c9e48e2c09a18902695716a0d1aa387b6f4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 30 Mar 2018 16:01:31 +0100 Subject: tracing: Default to using trace_global_clock if sched_clock is unstable Across suspend, we may see a very large drift in timestamps if the sched clock is unstable, prompting the global trace's ringbuffer code to warn and suggest switching to the global clock. Preempt this request by detecting when the sched clock is unstable (determined during late_initcall) and automatically switching the default clock over to trace_global_clock. This should prevent requiring user interaction to resolve warnings such as: Delta way too big! 18446743856563626466 ts=18446744054496180323 write stamp = 197932553857 If you just came from a suspend/resume, please switch to the trace global clock: echo global > /sys/kernel/debug/tracing/trace_clock Link: http://lkml.kernel.org/r/20180330150132.16903-1-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 551a7cd0d705..0f47e653ffd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "trace.h" @@ -8596,3 +8597,21 @@ __init static int clear_boot_tracer(void) fs_initcall(tracer_init_tracefs); late_initcall_sync(clear_boot_tracer); + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__init static int tracing_set_default_clock(void) +{ + /* sched_clock_stable() is determined in late_initcall */ + if (trace_boot_clock || sched_clock_stable()) { + printk(KERN_WARNING + "Unstable clock detected, switching default tracing clock to \"global\"\n" + "If you want to keep using the local clock, then add:\n" + " \"trace_clock=local\"\n" + "on the kernel command line\n"); + tracing_set_clock(&global_trace, "global"); + } + + return 0; +} +late_initcall_sync(tracing_set_default_clock); +#endif -- cgit v1.2.3-71-gd317 From 913ea4d0b1074bac4c42a43ac1677dc56bbbcc52 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 30 Mar 2018 16:01:32 +0100 Subject: tracing: Mention trace_clock=global when warning about unstable clocks Mention the alternative of adding trace_clock=global to the kernel command line when we detect that we've used an unstable clock across a suspend/resume cycle. 
Link: http://lkml.kernel.org/r/20180330150132.16903-2-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a2fd3893cc02..515be03e3009 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2731,7 +2731,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); info->add_timestamp = 1; } -- cgit v1.2.3-71-gd317 From 419e9fe53b7941481941984ce271b0ce946c3914 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Fri, 30 Mar 2018 10:53:08 +0200 Subject: ftrace: Drop a VLA in module_exists() Avoid a VLA by using a real constant expression instead of a variable. The compiler should be able to optimize the original code and avoid using an actual VLA. Anyway this change is useful because it will avoid a false positive with -Wvla, and it might also help the compiler generate better code. Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/1522399988-8815-1-git-send-email-s.mesoraca16@gmail.com Signed-off-by: Salvatore Mesoraca Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eac9ce2c57a2..16bbf062018f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3902,14 +3902,13 @@ static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ const char this_mod[] = "__this_module"; - const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; - char modname[modname_size + 1]; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; - n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - if (n > modname_size) + if (n > sizeof(modname) - 1) return false; val = module_kallsyms_lookup_name(modname); -- cgit v1.2.3-71-gd317 From 0ae7961e75c3fe3383796323d5342cbda8f82536 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 28 Mar 2018 15:10:53 -0500 Subject: tracing: Fix display of hist trigger expressions containing timestamps When displaying hist triggers, variable references that have the timestamp field flag set are erroneously displayed as common_timestamp rather than the variable reference. Additionally, timestamp expressions are displayed in the same way. Fix this by forcing the timestamp flag handling to follow variable reference and expression handling. Before: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/trigger hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs:... After: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/trigger hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0.usecs:...
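A sketch of the ordering the fix imposes (condensed from the hist_field_name() hunk in the diff below; not literal patch code): expression and variable-reference names are resolved first, and the common_timestamp fallback applies only when no such name exists.

	if (field->flags & (HIST_FIELD_FL_EXPR | HIST_FIELD_FL_VAR_REF))
		field_name = field->name;		/* e.g. "$ts0" or a full expression */
	else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
		field_name = "common_timestamp";	/* only as a last resort */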
Link: http://lkml.kernel.org/r/92746b06be67499c2a6217bd55395b350ad18fad.1522256721.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a02bc09d765a..4f4792f4c83f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1686,8 +1686,6 @@ static const char *hist_field_name(struct hist_field *field, else if (field->flags & HIST_FIELD_FL_LOG2 || field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); - else if (field->flags & HIST_FIELD_FL_TIMESTAMP) - field_name = "common_timestamp"; else if (field->flags & HIST_FIELD_FL_CPU) field_name = "cpu"; else if (field->flags & HIST_FIELD_FL_EXPR || @@ -1703,7 +1701,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = full_name; } else field_name = field->name; - } + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) + field_name = "common_timestamp"; if (field_name == NULL) field_name = ""; @@ -4858,23 +4857,15 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->var.name) seq_printf(m, "%s=", hist_field->var.name); - if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) - seq_puts(m, "common_timestamp"); - else if (hist_field->flags & HIST_FIELD_FL_CPU) + if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "cpu"); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) seq_putc(m, '$'); seq_printf(m, "%s", field_name); - } - - if (hist_field->flags) { - const char *flags_str = get_hist_field_flags(hist_field); - - if (flags_str) - seq_printf(m, ".%s", flags_str); - } + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) + seq_puts(m, "common_timestamp"); } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3-71-gd317 From 76690945f59e2f329f148e1266d9d13800629463 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 28 Mar 2018 15:10:54 -0500 Subject: tracing: Don't add flag strings when displaying variable references Variable references should never have flags appended when displayed - prevent that from happening. Before: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/trigger hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0.usecs:... After: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:... 
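In effect (a sketch mirroring the guard in the diff below): a variable reference such as $ts0 is printed bare, because any modifier like .usecs is already part of the variable's definition.

	const char *flags_str;

	/* Append modifier strings only for non-reference fields. */
	if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
		flags_str = get_hist_field_flags(field);
		if (flags_str) {
			strcat(expr, ".");
			strcat(expr, flags_str);
		}
	}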
Link: http://lkml.kernel.org/r/913318a5610ef6b24af2522575f671fa6ee19b6b.1522256721.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f4792f4c83f..d867502a56ba 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2052,7 +2052,7 @@ static void expr_field_str(struct hist_field *field, char *expr) strcat(expr, hist_field_name(field, 0)); - if (field->flags) { + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) { const char *flags_str = get_hist_field_flags(field); if (flags_str) { -- cgit v1.2.3-71-gd317 From 48f794731e4ca7b83b8b22a48bfc8641fa77dd09 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 28 Mar 2018 15:10:55 -0500 Subject: tracing: Add action comparisons when testing matching hist triggers Actions also need to be considered when checking for matching triggers - triggers differing only by action should be allowed, but currently aren't because the matching check ignores the action and erroneously returns -EEXIST. Add and call an actions_match() function to address that. Here's an example using onmatch() actions. The first -EEXIST shouldn't occur because the onmatch() is different in the second wakeup_latency() param. The second -EEXIST shouldn't occur because it's a different action (in this case, it doesn't have an action, so shouldn't be seen as being the same and therefore rejected). In the after case, both are correctly accepted (and trying to add one of them again returns -EEXIST as it should). before: # echo 'wakeup_latency u64 lat; pid_t pid' >> /sys/kernel/debug/tracing/synthetic_events # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0 if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,next_pid) if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,prev_pid) if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger -su: echo: write error: File exists # echo 'hist:keys=next_pid if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger -su: echo: write error: File exists after: # echo 'wakeup_latency u64 lat; pid_t pid' >> /sys/kernel/debug/tracing/synthetic_events # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0 if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,next_pid) if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,prev_pid) if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # echo 'hist:keys=next_pid if 
next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger Link: http://lkml.kernel.org/r/a7fd668b87ec10736c8f016ac4279c8480d50c2b.1522256721.git.tom.zanussi@linux.intel.com Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 50 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d867502a56ba..6114939f065a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4364,6 +4364,53 @@ static void print_onmatch_spec(struct seq_file *m, seq_puts(m, ")"); } +static bool actions_match(struct hist_trigger_data *hist_data, + struct hist_trigger_data *hist_data_test) +{ + unsigned int i, j; + + if (hist_data->n_actions != hist_data_test->n_actions) + return false; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + struct action_data *data_test = hist_data_test->actions[i]; + + if (data->fn != data_test->fn) + return false; + + if (data->n_params != data_test->n_params) + return false; + + for (j = 0; j < data->n_params; j++) { + if (strcmp(data->params[j], data_test->params[j]) != 0) + return false; + } + + if (data->fn == action_trace) { + if (strcmp(data->onmatch.synth_event_name, + data_test->onmatch.synth_event_name) != 0) + return false; + if (strcmp(data->onmatch.match_event_system, + data_test->onmatch.match_event_system) != 0) + return false; + if (strcmp(data->onmatch.match_event, + data_test->onmatch.match_event) != 0) + return false; + } else if (data->fn == onmax_save) { + if (strcmp(data->onmax.var_str, + data_test->onmax.var_str) != 0) + return false; + if (strcmp(data->onmax.fn_name, + data_test->onmax.fn_name) != 0) + return false; + } + } + + return true; +} + + static void print_actions_spec(struct seq_file *m, struct hist_trigger_data *hist_data) { @@ -5174,6 +5221,9 @@ static bool hist_trigger_match(struct event_trigger_data *data, (strcmp(data->filter_str, data_test->filter_str) != 0)) return false; + if (!actions_match(hist_data, hist_data_test)) + return false; + return true; } -- cgit v1.2.3-71-gd317 From ad452870c66e05819a99b491b500a13989a1c502 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 28 Mar 2018 15:10:56 -0500 Subject: tracing: Make sure variable string fields are NULL-terminated The strncpy() currently being used for variable string fields can result in a lack of termination if the string length is equal to the field size. Use the safer strscpy() instead, which will guarantee termination. 
Link: http://lkml.kernel.org/r/fb97c1e518fb358c12a4057d7445ba2c46956cd7.1522256721.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6114939f065a..15ea11c29a51 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -669,7 +669,7 @@ static notrace void trace_event_raw_event_synth(void *__data, char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; char *str_field = (char *)&entry->fields[n_u64]; - strncpy(str_field, str_val, STR_VAR_LEN_MAX); + strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; @@ -3091,7 +3091,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; - strncpy(str, val_str, STR_VAR_LEN_MAX); + strscpy(str, val_str, STR_VAR_LEN_MAX); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); -- cgit v1.2.3-71-gd317 From b28d7b2dc27f0eef1ae608b49d6860f2463910f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 Mar 2018 14:48:15 +0300 Subject: tracing: Uninitialized variable in create_tracing_map_fields() Smatch complains that idx can be used uninitialized when we check if (idx < 0). It has to be the first iteration through the loop and the HIST_FIELD_FL_STACKTRACE bit has to be clear and the HIST_FIELD_FL_VAR bit has to be set to reach the bug. Link: http://lkml.kernel.org/r/20180328114815.GC29050@mwanda Fixes: 30350d65ac56 ("tracing: Add variable support to hist triggers") Acked-by: Tom Zanussi Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 15ea11c29a51..0d7b3ffbecc2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4458,7 +4458,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) struct tracing_map *map = hist_data->map; struct ftrace_event_field *field; struct hist_field *hist_field; - int i, idx; + int i, idx = 0; for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; -- cgit v1.2.3-71-gd317 From 4c281074d2e7beb8179d81c3d2c2a53ae47dfa1c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 3 Apr 2018 10:31:47 -0400 Subject: lockdep: Add print_irqtrace_events() to __warn Running a test on a x86_32 kernel I triggered a bug that an interrupt disable/enable isn't being catched by lockdep. At least knowing where the last one was found would be helpful, but the warnings that are produced do not show this information. Even without debugging lockdep, having the WARN() display the last place hard and soft irqs were enabled or disabled is valuable. 
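As a usage sketch (illustrative only; requires CONFIG_TRACE_IRQFLAGS), the same lockdep helper can be called from any debug path to dump where hard and soft irqs were last enabled and disabled:

	if (WARN_ON_ONCE(!irqs_disabled()))
		print_irqtrace_events(current);	/* show the last irq on/off call sites */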
Signed-off-by: Steven Rostedt (VMware) --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..fa8d4cc4956a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -554,6 +554,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, else dump_stack(); + print_irqtrace_events(current); + print_oops_end_marker(); /* Just a warning, don't kill lockdep. */ -- cgit v1.2.3-71-gd317 From 2a872fa4e9c8adc79c830e4009e1cc0c013a9d8a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Apr 2018 10:33:56 -0400 Subject: ring-buffer: Check if memory is available before allocation The ring buffer is made up of a linked list of pages. When making the ring buffer bigger, it will allocate all the pages it needs before adding to the ring buffer, and if it fails, it frees them and returns an error. This makes increasing the ring buffer size an all or nothing action. When this was first created, the pages were allocated with "NORETRY". This was to not cause any Out-Of-Memory (OOM) actions from allocating the ring buffer. But NORETRY was too strict, as the ring buffer would fail to expand even when there's memory available, but was taken up in the page cache. Commit 848618857d253 ("tracing/ring_buffer: Try harder to allocate") changed the allocating from NORETRY to RETRY_MAYFAIL. The RETRY_MAYFAIL would allocate from the page cache, but if there was no memory available, it would simply fail the allocation and not trigger an OOM. This worked fine, but had one problem. As the ring buffer would allocate one page at a time, it could take up all memory in the system before it failed to allocate and free that memory. If the allocation is happening and the ring buffer allocates all memory and then tries to take more than available, its allocation will not trigger an OOM, but if there's any allocation that happens someplace else, that could trigger an OOM, even though once the ring buffer's allocation fails, it would free up all the previous memory it tried to allocate, and allow other memory allocations to succeed. Commit d02bd27bd33dd ("mm/page_alloc.c: calculate 'available' memory in a separate function") split out si_mem_available() as a separate function that could be used to see how much memory is available in the system. By using this function to make sure that the ring buffer can be allocated before it tries to allocate pages, we can avoid allocating all memory in the system and making it vulnerable to OOMs if other allocations are taking place.
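The shape of the guard (a sketch; the real check lands in __rb_allocate_pages() in the diff below) treats si_mem_available() as a cheap, approximate early-out rather than a guarantee:

	long avail = si_mem_available();	/* rough estimate of free + reclaimable pages */

	if (avail < nr_pages)
		return -ENOMEM;			/* clearly hopeless, bail before touching memory */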
Link: http://lkml.kernel.org/r/1522320104-6573-1-git-send-email-zhaoyang.huang@spreadtrum.com CC: stable@vger.kernel.org Cc: linux-mm@kvack.org Fixes: 848618857d253 ("tracing/ring_buffer: Try harder to allocate") Requires: d02bd27bd33dd ("mm/page_alloc.c: calculate 'available' memory in a separate function") Reported-by: Zhaoyang Huang Tested-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 515be03e3009..966128f02121 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1164,6 +1164,11 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) struct buffer_page *bpage, *tmp; long i; + /* Check if the available memory is there first */ + i = si_mem_available(); + if (i < nr_pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { struct page *page; /* -- cgit v1.2.3-71-gd317 From 927e56db6253225166d521cee3772624347b5cd5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 4 Apr 2018 11:29:57 -0400 Subject: ring-buffer: Add set/clear_current_oom_origin() during allocations As si_mem_available() can say there is enough memory even though the memory available is not useable by the ring buffer, it is best to not kill innocent applications because the ring buffer is taking up all the memory while it is trying to allocate a great deal of memory. If the allocator is user space (because kernel threads can also increase the size of the kernel ring buffer on boot up), then after si_mem_available() says there is enough memory, set the OOM killer to kill the current task if an OOM triggers during the allocation. Link: http://lkml.kernel.org/r/20180404062340.GD6312@dhcp22.suse.cz Suggested-by: Michal Hocko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 966128f02121..c9cb9767d49b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1162,35 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { struct buffer_page *bpage, *tmp; + bool user_thread = current->mm != NULL; + gfp_t mflags; long i; - /* Check if the available memory is there first */ + /* + * Check if the available memory is there first. + * Note, si_mem_available() only gives us a rough estimate of available + * memory. It may not be accurate. But we don't care, we just want + * to prevent doing any allocation when it is obvious that it is + * not going to succeed. + */ i = si_mem_available(); if (i < nr_pages) return -ENOMEM; + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; + + /* + * If a user thread allocates too much, and si_mem_available() + * reports there's enough memory, even though there is not. + * Make sure the OOM killer kills this thread. This can happen + * even with RETRY_MAYFAIL because another task may be doing + * an allocation after this task has taken all memory. 
+ * This is the task the OOM killer needs to take out during this + * loop, even if it was triggered by an allocation somewhere else. + */ + if (user_thread) + set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; - /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, - cpu_to_node(cpu)); + mflags, cpu_to_node(cpu)); if (!bpage) goto free_pages; list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); + page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); + + if (user_thread && fatal_signal_pending(current)) + goto free_pages; } + if (user_thread) + clear_current_oom_origin(); return 0; @@ -1199,6 +1225,8 @@ free_pages: list_del_init(&bpage->list); free_buffer_page(bpage); } + if (user_thread) + clear_current_oom_origin(); return -ENOMEM; } -- cgit v1.2.3-71-gd317 From f7a1570da91558fb85b61e53243fe3fa79e2bbae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 4 Apr 2018 14:50:15 -0400 Subject: tracing: Hide global trace clock from lockdep Function tracing can trace in NMIs and such. If the TSC is determined to be unstable, the tracing clock will switch to the global clock on boot up, unless "trace_clock" is specified on the kernel command line. The global clock disables interrupts to access sched_clock_cpu(), and that can happen within lockdep internals (because of function tracing and NMIs). This can trigger false lockdep splats. trace_clock_global() is special; it is best not to trace the irq logic within it. Link: http://lkml.kernel.org/r/20180404145015.77bde42d@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 5fdc779f411d..d8a188e0418a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - local_irq_save(flags); + raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = sched_clock_cpu(this_cpu); @@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - local_irq_restore(flags); + raw_local_irq_restore(flags); return now; } -- cgit v1.2.3-71-gd317 From 5125eee4e698f02b8e1a364ad5d7560f908d855f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 4 Apr 2018 22:24:50 +0100 Subject: tracing: Fixup logic inversion on setting trace_global_clock defaults In commit 932066a15335 ("tracing: Default to using trace_global_clock if sched_clock is unstable"), the logic for deciding to override the default clock if unstable was reversed from the earlier posting. I was trying to reduce the width of the message by using an early return rather than an if-block, but reverted to using the if-block and accidentally left the predicate inverted.
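The diff below restores the intended predicate; reduced to its essentials, the inversion looks like this (switch_to_global_clock() is an illustrative stand-in for the printk-and-set-clock body, not a kernel function):

	/* Intended: override the default only when no clock was chosen on
	 * the command line AND the local sched_clock is unstable. */
	if (!trace_boot_clock && !sched_clock_stable())
		switch_to_global_clock();

	/* Inverted variant that slipped in: fires in exactly the opposite
	 * cases, so stable systems were switched and unstable ones kept. */
	if (trace_boot_clock || sched_clock_stable())
		switch_to_global_clock();	/* wrong */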
Link: http://lkml.kernel.org/r/20180404212450.26646-1-chris@chris-wilson.co.uk Fixes: 932066a15335 ("tracing: Default to using trace_global_clock if sched_clock is unstable") Signed-off-by: Chris Wilson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0f47e653ffd8..e18e69183c9a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8602,7 +8602,7 @@ late_initcall_sync(clear_boot_tracer); __init static int tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ - if (trace_boot_clock || sched_clock_stable()) { + if (!trace_boot_clock && !sched_clock_stable()) { printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" -- cgit v1.2.3-71-gd317 From 1f3b0faa3e9dc713efce392af1f58542e735f822 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Apr 2018 13:39:36 -0400 Subject: tracing: Add rcu dereference annotation for filter->prog ftrace_function_set_filter() references filter->prog without annotation and sparse complains about it. It needs an rcu_dereference_protected() wrapper. Reported-by: kbuild test robot Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 703a416aa5c2..cf8460caa95c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1992,7 +1992,8 @@ static bool is_or(struct prog_entry *prog, int i) static int ftrace_function_set_filter(struct perf_event *event, struct event_filter *filter) { - struct prog_entry *prog = filter->prog; + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); struct function_filter_data data = { .first_filter = 1, .first_notrace = 1, -- cgit v1.2.3-71-gd317 From 8ec8405f081e1e0f800b20f683451c37e81e26c1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Apr 2018 15:20:26 -0400 Subject: tracing: Add rcu dereference annotation for test func that touches filter->prog The boot-up test function update_pred_fn() dereferences filter->prog without the proper rcu annotation. To add the annotation, we must also take the event_mutex first. Normally, this isn't needed because this test function cannot race with other use cases that touch the event filters (it is disabled if any events are enabled).
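The annotation pattern in this pair of patches is the standard one for RCU-protected pointers that are only updated under a lock: a reader holding that same lock may skip rcu_read_lock(), but must say so via rcu_dereference_protected() so that sparse and lockdep can verify the claim. A generic sketch, with my_config and my_mutex as illustrative names:

	#include <linux/mutex.h>
	#include <linux/rcupdate.h>

	struct my_config {
		struct prog_entry __rcu *prog;	/* updated only under my_mutex */
	};

	static DEFINE_MUTEX(my_mutex);

	static struct prog_entry *my_config_prog_locked(struct my_config *c)
	{
		/* Documents and checks the locking assumption in one place. */
		return rcu_dereference_protected(c->prog,
						 lockdep_is_held(&my_mutex));
	}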
Reported-by: kbuild test robot Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index cf8460caa95c..1bda4ec95e18 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2155,7 +2155,8 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event) static void update_pred_fn(struct event_filter *filter, char *fields) { - struct prog_entry *prog = filter->prog; + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); int i; for (i = 0; prog[i].pred; i++) { @@ -2197,6 +2198,8 @@ static __init int ftrace_test_event_filter(void) break; } + /* Needed to dereference filter->prog */ + mutex_lock(&event_mutex); /* * The preemption disabling is not really needed for self * tests, but the rcu dereference will complain without it. */ @@ -2209,6 +2212,8 @@ static __init int ftrace_test_event_filter(void) err = filter_match_preds(filter, &d->rec); preempt_enable(); + mutex_unlock(&event_mutex); + __free_filter(filter); if (test_pred_visited) { -- cgit v1.2.3-71-gd317 From 58eacfffc41735c9155becc73cb7f4dcc60a46a9 Mon Sep 17 00:00:00 2001 From: Abderrahmane Benbachir Date: Thu, 22 Mar 2018 20:33:28 -0400 Subject: init, tracing: instrument security and console initcall trace events Trace events have been added around the initcall functions defined in init/main.c. But console and security have their own initcalls. This adds the associated trace events for those initcall functions. Link: http://lkml.kernel.org/r/1521765208.19745.2.camel@polymtl.ca Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Abderrahmane Benbachir Signed-off-by: Steven Rostedt (VMware) --- kernel/printk/printk.c | 7 ++++++- security/security.c | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f274fbef821d..cb5b35341d69 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -52,6 +52,7 @@ #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2781,6 +2782,7 @@ EXPORT_SYMBOL(unregister_console); */ void __init console_init(void) { + int ret; initcall_t *call; /* Setup the default TTY line discipline. */ @@ -2791,8 +2793,11 @@ void __init console_init(void) * inform about problems etc..
*/ call = __con_initcall_start; + trace_initcall_level("console"); while (call < __con_initcall_end) { - (*call)(); + trace_initcall_start((*call)); + ret = (*call)(); + trace_initcall_finish((*call), ret); call++; } } diff --git a/security/security.c b/security/security.c index 1cd8526cb0b7..987afe3d464c 100644 --- a/security/security.c +++ b/security/security.c @@ -30,6 +30,8 @@ #include #include +#include + #define MAX_LSM_EVM_XATTR 2 /* Maximum number of letters for an LSM name string */ @@ -45,10 +47,14 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = static void __init do_security_initcalls(void) { + int ret; initcall_t *call; call = __security_initcall_start; + trace_initcall_level("security"); while (call < __security_initcall_end) { - (*call) (); + trace_initcall_start((*call)); + ret = (*call) (); + trace_initcall_finish((*call), ret); call++; } } -- cgit v1.2.3-71-gd317 From 23a8d888107ce4ce444eab2dcebf4cfb3578770b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Apr 2018 19:07:57 +0200 Subject: time: tick-sched: Split tick_nohz_stop_sched_tick() In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, split tick_nohz_stop_sched_tick() into two separate routines, one computing the time to the next timer event and the other simply stopping the tick when the time to the next timer event is known. Prepare these two routines to be called separately, as one of them will be called by the idle governor in the cpuidle_select() code path after subsequent changes. Update the former callers of tick_nohz_stop_sched_tick() to use the new routines, tick_nohz_next_event() and tick_nohz_stop_tick(), instead of it and move the updates of the sleep_length field in struct tick_sched into __tick_nohz_idle_stop_tick() as it doesn't need to be updated anywhere else. There should be no intentional visible changes in functionality resulting from this change. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 124 +++++++++++++++++++++++++++++------------------ kernel/time/tick-sched.h | 4 ++ 2 files changed, 82 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69fe113cfc7f..f56d2c695712 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -641,13 +641,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -656,6 +653,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -700,47 +698,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. 
*/ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } + /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. */ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -774,7 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -783,15 +797,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). 
- */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -827,7 +849,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -853,10 +875,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -893,29 +913,37 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); ktime_t expires; int cpu = smp_processor_id(); - if (can_stop_idle_tick(cpu, ts)) { + WARN_ON_ONCE(ts->timer_expires_base); + + if (!can_stop_idle_tick(cpu, ts)) + goto out; + + expires = tick_nohz_next_event(ts, cpu); + + ts->idle_calls++; + + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - /* - * The idle entry time should be a sufficient approximation of - * the current time at this point. 
- */ - expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } + +out: + ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime); } /** @@ -942,6 +970,9 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; tick_nohz_start_idle(ts); @@ -1067,6 +1098,7 @@ void tick_nohz_idle_exit(void) local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 954b43dbf21c..53e45a39bdbc 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -39,6 +39,8 @@ enum tick_nohz_mode { * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle */ struct tick_sched { @@ -60,6 +62,8 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; int do_timer_last; -- cgit v1.2.3-71-gd317 From a59855cd8c613ba4bb95147f6176360d95f75e60 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 3 Apr 2018 23:17:00 +0200 Subject: time: hrtimer: Introduce hrtimer_next_event_without() The next set of changes will need to compute the time to the next hrtimer event over all hrtimers except for the scheduler tick one. To that end introduce a new helper function, hrtimer_next_event_without(), for computing the time until the next hrtimer event over all timers except for one and modify the underlying code in __hrtimer_next_event_base() to prepare it for being called by that new function. No intentional changes in functionality. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c7902ca7c9f4..3892e9c8b2de 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -426,6 +426,7 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } extern u64 hrtimer_get_next_event(void); +extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23788100e214..6d387dbd7304 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -490,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -502,9 +503,24 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + struct rb_node *rbn = rb_next(&next->node); + + next = rb_entry_safe(rbn, struct timerqueue_node, node); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -548,7 +564,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -556,7 +573,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -1202,6 +1220,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. 
+ */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) -- cgit v1.2.3-71-gd317 From 554c8aa8ecade210d58a252173bb8f2106552a44 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 3 Apr 2018 23:17:11 +0200 Subject: sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_stop_tick() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made to call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_next_event_without(). The minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event under the assumption that the tick will be stopped. It can be returned to the idle governor, which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies Reported-by: Thomas Ilsche Signed-off-by: Rafael J.
Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- include/linux/tick.h | 2 ++ kernel/sched/idle.c | 11 ++++++--- kernel/time/tick-sched.c | 61 ++++++++++++++++++++++++++++++++++++++---------- kernel/time/tick-sched.h | 2 -- 4 files changed, 59 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index ef0717e5e526..e8e7ff16b929 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -116,6 +116,7 @@ extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped_cpu(int cpu); extern void tick_nohz_idle_stop_tick(void); +extern void tick_nohz_idle_retain_tick(void); extern void tick_nohz_idle_restart_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); @@ -139,6 +140,7 @@ static inline void tick_nohz_idle_stop_tick_protected(void) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } static inline void tick_nohz_idle_stop_tick(void) { } +static inline void tick_nohz_idle_retain_tick(void) { } static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a966bd2a6fa0..1a3e9bddd17b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -185,13 +185,18 @@ static void cpuidle_idle_call(void) } else { bool stop_tick = true; - tick_nohz_idle_stop_tick(); - rcu_idle_enter(); - /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev, &stop_tick); + + if (stop_tick) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); + + rcu_idle_enter(); + entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f56d2c695712..c57c98c7e953 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -913,16 +913,19 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); ktime_t expires; int cpu = smp_processor_id(); - WARN_ON_ONCE(ts->timer_expires_base); - - if (!can_stop_idle_tick(cpu, ts)) - goto out; - - expires = tick_nohz_next_event(ts, cpu); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. + */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; ts->idle_calls++; @@ -941,9 +944,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) } else { tick_nohz_retain_tick(ts); } - -out: - ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime); } /** @@ -956,6 +956,16 @@ void tick_nohz_idle_stop_tick(void) __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); } +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). 
+ */ + timer_clear_idle(); +} + /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * @@ -1012,15 +1022,42 @@ bool tick_nohz_idle_got_tick(void) } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_get_sleep_length - return the expected length of the current sleep * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_sleep_length(void) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + if (!can_stop_idle_tick(cpu, ts)) + goto out_dev; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + goto out_dev; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. + */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); - return ts->sleep_length; +out_dev: + return ktime_sub(dev->next_event, now); } /** diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 53e45a39bdbc..2b845f2c44b1 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,7 +38,6 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle @@ -60,7 +59,6 @@ struct tick_sched { ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; u64 timer_expires; u64 timer_expires_base; -- cgit v1.2.3-71-gd317 From 296bb1e51a4838a6488ec5ce676607093482ecbc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Apr 2018 19:12:34 +0200 Subject: cpuidle: menu: Refine idle state selection for running tick If the tick isn't stopped, the target residency of the state selected by the menu governor may be greater than the actual time to the next tick and that means lost energy. To avoid that, make tick_nohz_get_sleep_length() return the current time to the next event (before stopping the tick) in addition to the estimated one via an extra pointer argument and make menu_select() use that value to refine the state selection when necessary. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/governors/menu.c | 27 +++++++++++++++++++++++++-- include/linux/tick.h | 7 ++++--- kernel/time/tick-sched.c | 12 ++++++------ 3 files changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index f53a929bd2bd..267982e471e0 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -295,6 +295,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, unsigned int expected_interval; unsigned long nr_iowaiters, cpu_load; int resume_latency = dev_pm_qos_raw_read_value(device); + ktime_t delta_next; if (data->needs_update) { menu_update(drv, dev); @@ -312,7 +313,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); + data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); get_iowait_load(&nr_iowaiters, &cpu_load); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); @@ -396,9 +397,31 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - expected_interval < TICK_USEC) + expected_interval < TICK_USEC) { + unsigned int delta_next_us = ktime_to_us(delta_next); + *stop_tick = false; + if (!tick_nohz_tick_stopped() && idx > 0 && + drv->states[idx].target_residency > delta_next_us) { + /* + * The tick is not going to be stopped and the target + * residency of the state to be returned is not within + * the time until the next timer event including the + * tick, so try to correct that. 
+ */ + for (i = idx - 1; i >= 0; i--) { + if (drv->states[i].disabled || + dev->states_usage[i].disable) + continue; + + idx = i; + if (drv->states[i].target_residency <= delta_next_us) + break; + } + } + } + data->last_state_idx = idx; return data->last_state_idx; diff --git a/include/linux/tick.h b/include/linux/tick.h index e8e7ff16b929..55388ab45fd4 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -122,7 +122,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern bool tick_nohz_idle_got_tick(void); -extern ktime_t tick_nohz_get_sleep_length(void); +extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); @@ -146,9 +146,10 @@ static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } static inline bool tick_nohz_idle_got_tick(void) { return false; } -static inline ktime_t tick_nohz_get_sleep_length(void) +static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { - return NSEC_PER_SEC / HZ; + *delta_next = TICK_NSEC; + return *delta_next; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c57c98c7e953..edb9d49b4996 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1023,10 +1023,11 @@ bool tick_nohz_idle_got_tick(void) /** * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1040,12 +1041,14 @@ ktime_t tick_nohz_get_sleep_length(void) WARN_ON_ONCE(!ts->inidle); + *delta_next = ktime_sub(dev->next_event, now); + if (!can_stop_idle_tick(cpu, ts)) - goto out_dev; + return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) - goto out_dev; + return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the @@ -1055,9 +1058,6 @@ ktime_t tick_nohz_get_sleep_length(void) hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); - -out_dev: - return ktime_sub(dev->next_event, now); } /** -- cgit v1.2.3-71-gd317 From 2bc629a692a76b9ee3dab9c303e3f501bece66a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Apr 2018 04:32:37 +0200 Subject: nohz: Gather tick_sched booleans under a common flag field Optimize the space and leave plenty of room for further flags. Signed-off-by: Frederic Weisbecker [ rjw: Do not use __this_cpu_read() to access tick_stopped and add got_idle_tick to avoid overloading inidle ] Signed-off-by: Rafael J. 
Wysocki --- kernel/time/tick-sched.c | 12 +++++++----- kernel/time/tick-sched.h | 12 ++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index edb9d49b4996..a9d5cc7406d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -465,7 +465,9 @@ __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) @@ -1014,8 +1016,8 @@ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle > 1) { - ts->inidle = 1; + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; return true; } return false; @@ -1161,7 +1163,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) ktime_t now = ktime_get(); if (ts->inidle) - ts->inidle = 2; + ts->got_idle_tick = 1; dev->next_event = KTIME_MAX; @@ -1261,7 +1263,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) ktime_t now = ktime_get(); if (ts->inidle) - ts->inidle = 2; + ts->got_idle_tick = 1; tick_sched_do_timer(now); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 2b845f2c44b1..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -41,19 +41,24 @@ enum tick_nohz_mode { * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; @@ -64,7 +69,6 @@ struct tick_sched { u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; -- cgit v1.2.3-71-gd317 From ff7de6203131e3d60cda60aeda12c69373ca5d43 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Apr 2018 14:59:13 +0200 Subject: nohz: Avoid duplication of code related to got_idle_tick Move the code setting ts->got_idle_tick into tick_sched_do_timer() to avoid code duplication. No intentional changes in functionality. Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a9d5cc7406d3..956831cf6cfb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -113,8 +113,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -134,6 +133,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -1162,12 +1164,9 @@ static void tick_nohz_handler(struct clock_event_device *dev) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - if (ts->inidle) - ts->got_idle_tick = 1; - dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ @@ -1262,10 +1261,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - if (ts->inidle) - ts->got_idle_tick = 1; - - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have -- cgit v1.2.3-71-gd317 From 7d2f6abb402ae38ec4bb7beabb3980bb834b1e0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 7 Apr 2018 19:11:55 +0200 Subject: time: hrtimer: Use timerqueue_iterate_next() to get to the next timer Use timerqueue_iterate_next() to get to the next timer in __hrtimer_next_event_base() without browsing the timerqueue details directly. No intentional changes in functionality. Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. Wysocki --- kernel/time/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6d387dbd7304..14e858753d76 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -505,9 +505,7 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ - struct rb_node *rbn = rb_next(&next->node); - - next = rb_entry_safe(rbn, struct timerqueue_node, node); + next = timerqueue_iterate_next(next); if (!next) continue; -- cgit v1.2.3-71-gd317 From fe43e2ce526979271f6c682727c6af9e7c19bec5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 30 Mar 2018 16:18:44 -0500 Subject: PM / QoS: mark expected switch fall-throughs In preparation for enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/qos.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d7503910ce2..fa39092b7aea 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * changed */ plist_del(node, &c->list); + /* fall through */ case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); + /* fall through */ case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); -- cgit v1.2.3-71-gd317 From 621b6d2ea297d0fb6030452c5bcd221f12165fcf Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 9 Apr 2018 19:03:46 +0900 Subject: perf/core: Fix use-after-free in uprobe_perf_close() A use-after-free bug was caught by KASAN while running usdt related code (BCC project. bcc/tests/python/test_usdt2.py): ================================================================== BUG: KASAN: use-after-free in uprobe_perf_close+0x222/0x3b0 Read of size 4 at addr ffff880384f9b4a4 by task test_usdt2.py/870 CPU: 4 PID: 870 Comm: test_usdt2.py Tainted: G W 4.16.0-next-20180409 #215 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0xc7/0x15b ? show_regs_print_info+0x5/0x5 ? printk+0x9c/0xc3 ? kmsg_dump_rewind_nolock+0x6e/0x6e ? uprobe_perf_close+0x222/0x3b0 print_address_description+0x83/0x3a0 ? uprobe_perf_close+0x222/0x3b0 kasan_report+0x1dd/0x460 ? uprobe_perf_close+0x222/0x3b0 uprobe_perf_close+0x222/0x3b0 ? probes_open+0x180/0x180 ? free_filters_list+0x290/0x290 trace_uprobe_register+0x1bb/0x500 ? perf_event_attach_bpf_prog+0x310/0x310 ? probe_event_disable+0x4e0/0x4e0 perf_uprobe_destroy+0x63/0xd0 _free_event+0x2bc/0xbd0 ? lockdep_rcu_suspicious+0x100/0x100 ? ring_buffer_attach+0x550/0x550 ? kvm_sched_clock_read+0x1a/0x30 ? perf_event_release_kernel+0x3e4/0xc00 ? __mutex_unlock_slowpath+0x12e/0x540 ? wait_for_completion+0x430/0x430 ? lock_downgrade+0x3c0/0x3c0 ? lock_release+0x980/0x980 ? do_raw_spin_trylock+0x118/0x150 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 perf_event_release_kernel+0x5d4/0xc00 ? put_event+0x30/0x30 ? fsnotify+0xd2d/0xea0 ? sched_clock_cpu+0x18/0x1a0 ? __fsnotify_update_child_dentry_flags.part.0+0x1b0/0x1b0 ? pvclock_clocksource_read+0x152/0x2b0 ? pvclock_read_flags+0x80/0x80 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock_cpu+0x18/0x1a0 ? pvclock_clocksource_read+0x152/0x2b0 ? locks_remove_file+0xec/0x470 ? pvclock_read_flags+0x80/0x80 ? fcntl_setlk+0x880/0x880 ? ima_file_free+0x8d/0x390 ? lockdep_rcu_suspicious+0x100/0x100 ? ima_file_check+0x110/0x110 ? fsnotify+0xea0/0xea0 ? kvm_sched_clock_read+0x1a/0x30 ? rcu_note_context_switch+0x600/0x600 perf_release+0x21/0x40 __fput+0x264/0x620 ? fput+0xf0/0xf0 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 ? SyS_fchdir+0x100/0x100 ? fsnotify+0xea0/0xea0 task_work_run+0x14b/0x1e0 ? task_work_cancel+0x1c0/0x1c0 ? copy_fd_bitmaps+0x150/0x150 ? vfs_read+0xe5/0x260 exit_to_usermode_loop+0x17b/0x1b0 ? trace_event_raw_event_sys_exit+0x1a0/0x1a0 do_syscall_64+0x3f6/0x490 ? syscall_return_slowpath+0x2c0/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? syscall_return_slowpath+0x1a3/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? prepare_exit_to_usermode+0x11c/0x1e0 ? enter_from_user_mode+0x30/0x30 random: crng init done ? 
__put_user_4+0x1c/0x30 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x7f41d95f9340 RSP: 002b:00007fffe71e4268 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 000000000000000d RCX: 00007f41d95f9340 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 000000000000000d RBP: 0000000000000000 R08: 00007f41ca8ff700 R09: 00007f41d996dd1f R10: 00007fffe71e41e0 R11: 0000000000000246 R12: 00007fffe71e4330 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007fffe71e4290 Allocated by task 870: kasan_kmalloc+0xa0/0xd0 kmem_cache_alloc_node+0x11a/0x430 copy_process.part.19+0x11a0/0x41c0 _do_fork+0x1be/0xa20 do_syscall_64+0x198/0x490 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Freed by task 0: __kasan_slab_free+0x12e/0x180 kmem_cache_free+0x102/0x4d0 free_task+0xfe/0x160 __put_task_struct+0x189/0x290 delayed_put_task_struct+0x119/0x250 rcu_process_callbacks+0xa6c/0x1b60 __do_softirq+0x238/0x7ae The buggy address belongs to the object at ffff880384f9b480 which belongs to the cache task_struct of size 12928 It occurs because the task_struct is freed before the perf_event that refers to the task, and the task flags are checked during teardown of the event. perf_event_alloc() assigns the task_struct to hw.target of the perf_event, but there is no reference counting for it. As a fix, we get_task_struct() in perf_event_alloc() at the above-mentioned assignment and put_task_struct() in _free_event(). Signed-off-by: Prashant Bhole Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 63b6da39bb38e8f1a1ef3180d32a39d6 ("perf: Fix perf_event_exit_task() race") Link: http://lkml.kernel.org/r/20180409100346.6416-1-bhole_prashant_q7@lab.ntt.co.jp Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fc1c330c6bd6..d7af82827373 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->hw.target) + put_task_struct(event->hw.target); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -9955,6 +9958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx.
*/ + get_task_struct(task); event->hw.target = task; } @@ -10070,6 +10074,8 @@ err_ns: perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); + if (event->hw.target) + put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); -- cgit v1.2.3-71-gd317 From 5da13ab8b0dcaa984c45ae43edf5a4d148603d42 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 9 Apr 2018 21:16:54 +0900 Subject: perf/core: Fix perf_kprobe_init() Fix error handling in perf_kprobe_init(): ================================================================== BUG: KASAN: slab-out-of-bounds in strlen+0x8e/0xa0 lib/string.c:482 Read of size 1 at addr ffff88003f9cc5c0 by task syz-executor2/23095 CPU: 0 PID: 23095 Comm: syz-executor2 Not tainted 4.16.0+ #24 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xca/0x13e lib/dump_stack.c:113 print_address_description+0x6e/0x2c0 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report+0x256/0x380 mm/kasan/report.c:412 strlen+0x8e/0xa0 lib/string.c:482 kstrdup+0x21/0x70 mm/util.c:55 alloc_trace_kprobe+0xc8/0x930 kernel/trace/trace_kprobe.c:325 create_local_trace_kprobe+0x4f/0x3a0 kernel/trace/trace_kprobe.c:1438 perf_kprobe_init+0x149/0x1f0 kernel/trace/trace_event_perf.c:264 perf_kprobe_event_init+0xa8/0x120 kernel/events/core.c:8407 perf_try_init_event+0xcb/0x2a0 kernel/events/core.c:9719 perf_init_event kernel/events/core.c:9750 [inline] perf_event_alloc+0x1367/0x1e20 kernel/events/core.c:10022 SYSC_perf_event_open+0x242/0x2330 kernel/events/core.c:10477 do_syscall_64+0x198/0x640 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Reported-by: 范龙飞 Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 2c416509b834..94600f1f7efa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) ret = strncpy_from_user( func, u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); + if (ret == KSYM_NAME_LEN) + ret = -E2BIG; if (ret < 0) goto out; -- cgit v1.2.3-71-gd317 From 0eadcc7a7bc03e991d2da1cf88143fb7cc0342c1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 9 Apr 2018 18:31:30 +0000 Subject: perf/core: Fix perf_uprobe_init() Similarly to the kprobe PMU fix in perf_kprobe_init(), fix error handling in perf_uprobe_init() as well.
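Both this fix and the perf_kprobe_init() one above rest on the same strncpy_from_user() contract: it returns the string length on success, a negative error on fault, and exactly the buffer size when the source string did not fit, in which case the destination is not NUL-terminated. A generic sketch of the resulting checking pattern (copy_string_from_user() is an illustrative wrapper, not a kernel function):

	#include <linux/uaccess.h>

	/* Sketch: reject untermination instead of using the buffer as-is. */
	static int copy_string_from_user(char *dst, const char __user *src,
					 long count)
	{
		long ret = strncpy_from_user(dst, src, count);

		if (ret == count)
			return -E2BIG;	/* truncated: dst lacks the trailing NUL */
		if (ret < 0)
			return ret;	/* fault while reading user memory */
		return 0;		/* dst is a proper NUL-terminated string */
	}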
Reported-by: 范龙飞 Signed-off-by: Song Liu Acked-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 94600f1f7efa..c79193e598f5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -302,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) return -ENOMEM; ret = strncpy_from_user( path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret == PATH_MAX) + return -E2BIG; if (ret < 0) goto out; if (path[0] == '\0') { -- cgit v1.2.3-71-gd317 From bbe9a70a478129f3f9b2003415d0c36afcea210f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Apr 2018 14:23:33 +0200 Subject: tick-sched: avoid a maybe-uninitialized warning The use of bitfields seems to confuse gcc, leading to a false-positive warning in all compiler versions: kernel/time/tick-sched.c: In function 'tick_nohz_idle_exit': kernel/time/tick-sched.c:538:2: error: 'now' may be used uninitialized in this function [-Werror=maybe-uninitialized] This introduces a temporary variable to track the flags so gcc doesn't have to evaluate the bitfields twice, eliminating the code path that leads to the warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85301 Fixes: 1cae544d42d2 ("nohz: Gather tick_sched booleans under a common flag field") Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- kernel/time/tick-sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 956831cf6cfb..e35a6fced00c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1134,6 +1134,7 @@ void tick_nohz_idle_restart_tick(void) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); @@ -1142,14 +1143,16 @@ void tick_nohz_idle_exit(void) WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) + if (tick_stopped) __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); -- cgit v1.2.3-71-gd317 From 50268a3d266ecfdd6c5873d62b2758d9732fc598 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Apr 2018 21:20:08 +0900 Subject: tracing/uprobe_event: Fix strncpy corner case Fix the string fetch function to terminate with NUL. It is OK to drop the rest of the string.
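Here the opposite policy is applied: the fetched string is debug data, so instead of failing on truncation the last byte is overwritten with NUL and the tail is dropped. Reduced to the bare pattern, mirroring the diff that follows (the failure branch is simplified relative to the actual fetch function):

	long ret = strncpy_from_user(dst, src, maxlen);

	if (ret == maxlen)
		dst[--ret] = '\0';	/* buffer filled: force termination */
	else if (ret < 0)
		dst[0] = '\0';		/* fetch failed: store an empty string */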
Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Cc: security@kernel.org Cc: 范龙飞 Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes") Signed-off-by: Ingo Molnar --- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2014f4351ae0..0d450b40988e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, return; ret = strncpy_from_user(dst, src, maxlen); + if (ret == maxlen) + dst[--ret] = '\0'; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; -- cgit v1.2.3-71-gd317 From 0a4d0564f0fc377ad0ba66ce214b9b16324aea4b Mon Sep 17 00:00:00 2001 From: Jérémy Lefaure Date: Sun, 15 Oct 2017 21:22:49 -0400 Subject: tracing: Use ARRAY_SIZE() macro instead of open coding it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useless to re-invent the ARRAY_SIZE macro so let's use it instead of DATA_CNT. Found with Coccinelle with the following semantic patch: @r depends on (org || report)@ type T; T[] E; position p; @@ ( (sizeof(E)@p /sizeof(*E)) | (sizeof(E)@p /sizeof(E[...])) | (sizeof(E)@p /sizeof(T)) ) Link: http://lkml.kernel.org/r/20171016012250.26453-1-jeremy.lefaure@lse.epita.fr Signed-off-by: Jérémy Lefaure [ Removed useless include of kernel.h ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1bda4ec95e18..5eba1cec945c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2140,7 +2140,7 @@ static struct test_filter_data_t { #undef YES #undef NO -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) +#define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; -- cgit v1.2.3-71-gd317 From f0a2aa5a2a406d0a57aa9b320ffaa5538672b6c5 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Tue, 10 Apr 2018 16:10:30 -0700 Subject: tracing/uprobe: Add support for overlayfs uprobes cannot successfully attach to binaries located in a directory mounted with overlayfs. To verify, create directories for mounting overlayfs (upper,lower,work,merge), move some binary into merge/ and use readelf to obtain some known instruction of the binary. I used /bin/true and the entry instruction(0x13b0): $ mount -t overlay overlay -o lowerdir=lower,upperdir=upper,workdir=work merge $ cd /sys/kernel/debug/tracing $ echo 'p:true_entry PATH_TO_MERGE/merge/true:0x13b0' > uprobe_events $ echo 1 > events/uprobes/true_entry/enable This returns 'bash: echo: write error: Input/output error' and dmesg tells us 'event trace: Could not enable event true_entry' This change makes create_trace_uprobe() look for the real inode of a dentry. In the case of normal filesystems, this simplifies to just returning the inode. In the case of overlayfs(and similar fs) we will obtain the underlying dentry and corresponding inode, upon which uprobes can successfully register. Running the example above with the patch applied, we can see that the uprobe is enabled and will output to trace as expected. 
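The essence of the change, as a sketch (the helper name is hypothetical): d_real_inode() reduces to d_inode() on ordinary filesystems, but on overlayfs it resolves to the underlying filesystem's inode, which is the one uprobes must register against.

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/path.h>

static struct inode *uprobe_backing_inode(const struct path *path)
{
	/* On overlayfs this pins the real (underlying) inode rather
	 * than the overlay inode that igrab(d_inode(...)) would pin. */
	return igrab(d_real_inode(path->dentry));
}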
Link: http://lkml.kernel.org/r/20180410231030.2720-1-hmclauchlan@fb.com Reviewed-by: Josef Bacik Reviewed-by: Masami Hiramatsu Reviewed-by: Srikar Dronamraju Signed-off-by: Howard McLauchlan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..8b86d76c55ee 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -446,7 +446,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(d_inode(path.dentry)); + inode = igrab(d_real_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { -- cgit v1.2.3-71-gd317 From 18d45b11d96e6f9b3814960a1394083a3d6b7f74 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:55 +0530 Subject: trace_uprobe: Use %lx to display offset tu->offset is unsigned long, not a pointer, thus %lx should be used to print it, not the %px. Link: http://lkml.kernel.org/r/20180315082756.9050-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Fixes: 0e4d819d0893 ("trace_uprobe: Display correct offset in uprobe_events") Suggested-by: Kees Cook Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b86d76c55ee..d7d3c9237f64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%px", (void *)tu->offset); + seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); } else { switch (sizeof(void *)) { case 4: -- cgit v1.2.3-71-gd317 From a64b2c01e67eff8b8d0d438507fd0346290697cf Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:56 +0530 Subject: trace_uprobe: Simplify probes_seq_show() Simplify probes_seq_show() function. No change in output before and after patch. Link: http://lkml.kernel.org/r/20180315082756.9050-2-ravi.bangoria@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d7d3c9237f64..21604754bb79 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -602,24 +602,9 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:", tu->filename); - - /* Don't print "0x (null)" when offset is 0 */ - if (tu->offset) { - seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); - } else { - switch (sizeof(void *)) { - case 4: - seq_printf(m, "0x00000000"); - break; - case 8: - default: - seq_printf(m, "0x0000000000000000"); - break; - } - } + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, + trace_event_name(&tu->tp.call), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3-71-gd317 From 0b3dec05dbbce023f4f25aba975b5d253c313ebb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Apr 2018 10:59:46 -0400 Subject: tracing: Enforce passing in filter=NULL to create_filter() There's some inconsistency in what the output parameter filterp should be set to when passed to create_filter(..., struct event_filter **filterp). Whatever filterp points to should be NULL when calling this function. create_filter() calls create_filter_start() with a pointer to a local "filter" variable that is set to NULL, and create_filter_start() has a WARN_ON() if the passed-in pointer isn't pointing to a value set to NULL. Ideally, create_filter() should pass the filterp variable it received to create_filter_start() instead of hiding it behind a local variable. The old behavior allowed create_filter() to fail without updating the passed-in filter; the caller of create_filter() then tried to free filter, which was never initialized to anything, causing memory corruption. Link: http://lkml.kernel.org/r/00000000000032a0c30569916870@google.com Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+dadcc936587643d7f568@syzkaller.appspotmail.com Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5eba1cec945c..9b4716bb8bb0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1704,18 +1704,16 @@ static int create_filter(struct trace_event_call *call, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; - struct event_filter *filter = NULL; int err; - err = create_filter_start(filter_string, set_str, &pe, &filter); + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; - err = process_preds(call, filter_string, filter, pe); + err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); - *filterp = filter; return err; } @@ -1739,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &pe, &filter); + err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; }
else { - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); } } create_filter_finish(pe); - *filterp = filter; return err; } @@ -1764,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; - struct event_filter *filter; + struct event_filter *filter = NULL; int err; if (!strcmp(strstrip(filter_string), "0")) { @@ -1817,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; - struct event_filter *filter; + struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); @@ -2024,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; - struct event_filter *filter; + struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex); -- cgit v1.2.3-71-gd317 From 47d4b263a2f7324fb3cb641ca00b2725dd12dea0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:26 -0700 Subject: taint: convert to indexed initialization This converts to using indexed initializers instead of comments, adds a comment on why the taint flags can't be an enum, and make sure that no one forgets to update the taint_flags when adding new bits. Link: http://lkml.kernel.org/r/1519084390-43867-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + kernel/panic.c | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 98273343bd45..086e8e80f765 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -544,6 +544,7 @@ extern enum system_states { SYSTEM_RESTART, } system_state; +/* This cannot be an enum because some may be used in assembly source. */ #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 #define TAINT_CPU_OUT_OF_SPEC 2 diff --git a/kernel/panic.c b/kernel/panic.c index 6c3b08cd1139..af4cfa8eda22 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -308,23 +308,23 @@ EXPORT_SYMBOL(panic); * is being removed anyway. 
*/ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, + [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, }; /** @@ -354,6 +354,8 @@ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; -- cgit v1.2.3-71-gd317 From 9c4560e5bbd8c839c8986f79ef536aa07bd77ec7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:29 -0700 Subject: taint: consolidate documentation This consolidates the taint bit documentation into a single place with both numeric and letter values. Additionally adds the missing TAINT_AUX documentation. Link: http://lkml.kernel.org/r/1519084390-43867-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 53 +++++++++++++++++++++-------------------- kernel/panic.c | 23 ++++-------------- 2 files changed, 31 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 412314eebda6..4a890c7fb6c3 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -964,32 +964,33 @@ detect a hard lockup condition. tainted: -Non-zero if the kernel has been tainted. Numeric values, which -can be ORed together: - - 1 - A module with a non-GPL license has been loaded, this - includes modules with no license. - Set by modutils >= 2.4.9 and module-init-tools. - 2 - A module was force loaded by insmod -f. - Set by modutils >= 2.4.9 and module-init-tools. - 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP. - 8 - A module was forcibly unloaded from the system by rmmod -f. - 16 - A hardware machine check error occurred on the system. - 32 - A bad page was discovered on the system. 
- 64 - The user has asked that the system be marked "tainted". This - could be because they are running software that directly modifies - the hardware, or for other reasons. - 128 - The system has died. - 256 - The ACPI DSDT has been overridden with one supplied by the user - instead of using the one provided by the hardware. - 512 - A kernel warning has occurred. -1024 - A module from drivers/staging was loaded. -2048 - The system is working around a severe firmware bug. -4096 - An out-of-tree module has been loaded. -8192 - An unsigned module has been loaded in a kernel supporting module - signature. -16384 - A soft lockup has previously occurred on the system. -32768 - The kernel has been live patched. +Non-zero if the kernel has been tainted. Numeric values, which can be +ORed together. The letters are seen in "Tainted" line of Oops reports. + + 1 (P): A module with a non-GPL license has been loaded, this + includes modules with no license. + Set by modutils >= 2.4.9 and module-init-tools. + 2 (F): A module was force loaded by insmod -f. + Set by modutils >= 2.4.9 and module-init-tools. + 4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP. + 8 (R): A module was forcibly unloaded from the system by rmmod -f. + 16 (M): A hardware machine check error occurred on the system. + 32 (B): A bad page was discovered on the system. + 64 (U): The user has asked that the system be marked "tainted". This + could be because they are running software that directly modifies + the hardware, or for other reasons. + 128 (D): The system has died. + 256 (A): The ACPI DSDT has been overridden with one supplied by the user + instead of using the one provided by the hardware. + 512 (W): A kernel warning has occurred. + 1024 (C): A module from drivers/staging was loaded. + 2048 (I): The system is working around a severe firmware bug. + 4096 (O): An out-of-tree module has been loaded. + 8192 (E): An unsigned module has been loaded in a kernel supporting module + signature. + 16384 (L): A soft lockup has previously occurred on the system. + 32768 (K): The kernel has been live patched. + 65536 (X): Auxiliary taint, defined and used by for distros. ============================================================== diff --git a/kernel/panic.c b/kernel/panic.c index af4cfa8eda22..5ceb9cbec4a2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -328,27 +328,12 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). 
+ * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. */ const char *print_tainted(void) { -- cgit v1.2.3-71-gd317 From bc4f2f5469ac2a52affadc4c00c1276d76151a39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 16:32:33 -0700 Subject: taint: add taint for randstruct Since the randstruct plugin can intentionally produce extremely unusual kernel structure layouts (even performance pathological ones), some maintainers want to be able to trivially determine if an Oops is coming from a randstruct-built kernel, so as to keep their sanity when debugging. This adds the new flag and initializes taint_mask immediately when built with randstruct. Link: http://lkml.kernel.org/r/1519084390-43867-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Al Viro Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 3 ++- kernel/panic.c | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 4a890c7fb6c3..eded671d55eb 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -991,6 +991,7 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. 16384 (L): A soft lockup has previously occurred on the system. 32768 (K): The kernel has been live patched. 65536 (X): Auxiliary taint, defined and used by for distros. +131072 (T): The kernel was built with the struct randomization plugin. ============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 086e8e80f765..6a1eb0b0aad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -562,7 +562,8 @@ extern enum system_states { #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 #define TAINT_AUX 16 -#define TAINT_FLAGS_COUNT 17 +#define TAINT_RANDSTRUCT 17 +#define TAINT_FLAGS_COUNT 18 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 5ceb9cbec4a2..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -325,6 +326,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** -- cgit v1.2.3-71-gd317 From 3ea056c50476f877f8bceb560ab69871098cb3a9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Apr 2018 16:32:36 -0700 Subject: uts: create "struct uts_namespace" from kmem_cache So "struct uts_namespace" can enjoy fine-grained SLAB debugging and usercopy protection. I'd prefer shorter name "utsns" but there is "user_namespace" already. Link: http://lkml.kernel.org/r/20180228215158.GA23146@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: "Eric W. 
Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 6 ++++++ init/main.c | 2 ++ kernel/utsname.c | 20 ++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index c8060c2ecd04..44429d9142ca 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -44,6 +44,8 @@ static inline void put_uts_ns(struct uts_namespace *ns) { kref_put(&ns->kref, free_uts_ns); } + +void uts_ns_init(void); #else static inline void get_uts_ns(struct uts_namespace *ns) { @@ -61,6 +63,10 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, return old_ns; } + +static inline void uts_ns_init(void) +{ +} #endif #ifdef CONFIG_PROC_SYSCTL diff --git a/init/main.c b/init/main.c index d499f4a80e0b..50359a3162d0 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -706,6 +707,7 @@ asmlinkage __visible void __init start_kernel(void) cred_init(); fork_init(); proc_caches_init(); + uts_ns_init(); buffer_init(); key_init(); security_init(); diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -19,6 +19,8 @@ #include #include +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} -- cgit v1.2.3-71-gd317 From 2d87b309a5d66c3ec0b4d985fe29b547282e7427 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Apr 2018 16:35:14 -0700 Subject: kernel/sysctl.c: fix sizeof argument to match variable name Fix sizeof argument to be the same as the data variable name. Probably a copy/paste error. Mostly harmless since both variables are unsigned int. 
Fixes kernel bugzilla #197371: Possible access to unintended variable in "kernel/sysctl.c" line 1339 https://bugzilla.kernel.org/show_bug.cgi?id=197371 Link: http://lkml.kernel.org/r/e0d0531f-361e-ef5f-8499-32743ba907e1@infradead.org Signed-off-by: Randy Dunlap Reported-by: Petru Mihancea Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bdf7090b106d..a2854f6e0743 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), + .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, -- cgit v1.2.3-71-gd317 From edc41b3c5489996e4c1ec820bf102660bf745c45 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 10 Apr 2018 16:35:18 -0700 Subject: kernel/params.c: downgrade warning for unsafe parameters As using an unsafe module parameter is, by its very definition, an expected user action, emitting a warning is overkill. Nothing has yet gone wrong, and we add a taint flag for any future oops should something actually go wrong. So instead of having a user-controllable pr_warn, downgrade it to a pr_notice for "a normal, but significant condition". We make use of unsafe kernel parameters in igt (https://cgit.freedesktop.org/drm/igt-gpu-tools/) (we have not yet succeeded in removing all such debugging options), which generates a warning and taints the kernel. The warning is unhelpful as we then need to filter it out again when we check that the tests themselves do not provoke any kernel warnings. Link: http://lkml.kernel.org/r/20180226151919.9674-1-chris@chris-wilson.co.uk Fixes: 91f9d330cc14 ("module: make it possible to have unsafe, tainting module params") Signed-off-by: Chris Wilson Acked-by: Jani Nikula Reviewed-by: Andrew Morton Cc: Rusty Russell Cc: Jean Delvare Cc: Li Zhong Cc: Petri Latvala Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } -- cgit v1.2.3-71-gd317 From 24704f36196ce79b48dd3921e782d15fd9c87959 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Apr 2018 16:35:38 -0700 Subject: kernel/sysctl.c: add kdoc comments to do_proc_do{u}intvec_minmax_conv_param Kdoc comments are added to the do_proc_dointvec_minmax_conv_param and do_proc_douintvec_minmax_conv_param structures that are used internally for range checking. The error codes returned by proc_dointvec_minmax() and proc_douintvec_minmax() are also documented. Link: http://lkml.kernel.org/r/1519926220-7453-3-git-send-email-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Andrew Morton Acked-by: Luis R.
Rodriguez Cc: Al Viro Cc: Davidlohr Bueso Cc: Kees Cook Cc: Manfred Spraul Cc: Matthew Wilcox Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a2854f6e0743..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3-71-gd317 From f6bb2a2c0b81c47282ddb7883f92e65a063c27dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 10 Apr 2018 16:36:52 -0700 Subject: xarray: add the xa_lock to the radix_tree_root This results in no change in structure size on 64-bit machines as it fits in the padding between the gfp_t and the void *. 32-bit machines will grow the structure from 8 to 12 bytes. Almost all radix trees are protected with (at least) a spinlock, so as they are converted from radix trees to xarrays, the data structures will shrink again. Initialising the spinlock requires a name for the benefit of lockdep, so RADIX_TREE_INIT() now needs to know the name of the radix tree it's initialising, and so do IDR_INIT() and IDA_INIT(). Also add the xa_lock() and xa_unlock() family of wrappers to make it easier to use the lock. If we could rely on -fplan9-extensions in the compiler, we could avoid all of this syntactic sugar, but that wasn't added until gcc 4.6. Link: http://lkml.kernel.org/r/20180313132639.17387-8-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Jeff Layton Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Ryusuke Konishi Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/gc.c | 2 +- include/linux/idr.h | 19 ++++++++++--------- include/linux/radix-tree.h | 7 +++++-- include/linux/xarray.h | 24 ++++++++++++++++++++++++ kernel/pid.c | 2 +- tools/include/linux/spinlock.h | 1 + 6 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 include/linux/xarray.h (limited to 'kernel') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfb7a4a3a929..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, diff --git a/include/linux/idr.h b/include/linux/idr.h index 913c335054f0..e856f4e0ab35 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -32,27 +32,28 @@ struct idr { #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) -#define IDR_INIT_BASE(base) { \ - .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \ +#define IDR_INIT_BASE(name, base) { \ + .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. + * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ -#define IDR_INIT IDR_INIT_BASE(0) +#define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** - * DEFINE_IDR() - Define a statically-allocated IDR - * @name: Name of IDR + * DEFINE_IDR() - Define a statically-allocated IDR. + * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. 
*/ -#define DEFINE_IDR(name) struct idr name = IDR_INIT +#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator @@ -219,10 +220,10 @@ struct ida { struct radix_tree_root ida_rt; }; -#define IDA_INIT { \ - .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \ +#define IDA_INIT(name) { \ + .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ } -#define DEFINE_IDA(name) struct ida name = IDA_INIT +#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 6c4e2e716dac..34149e8b5f73 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -110,20 +110,23 @@ struct radix_tree_node { #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) struct radix_tree_root { + spinlock_t xa_lock; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; -#define RADIX_TREE_INIT(mask) { \ +#define RADIX_TREE_INIT(name, mask) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .gfp_mask = (mask), \ .rnode = NULL, \ } #define RADIX_TREE(name, mask) \ - struct radix_tree_root name = RADIX_TREE_INIT(mask) + struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) \ do { \ + spin_lock_init(&(root)->xa_lock); \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/include/linux/xarray.h b/include/linux/xarray.h new file mode 100644 index 000000000000..2dfc8006fe64 --- /dev/null +++ b/include/linux/xarray.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_XARRAY_H +#define _LINUX_XARRAY_H +/* + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +#endif /* _LINUX_XARRAY_H */ diff --git a/kernel/pid.c b/kernel/pid.c index ed6c343fe50d..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 4ed569fcb139..b21b586b9854 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -7,6 +7,7 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; +#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) -- cgit v1.2.3-71-gd317 From 32e6e967fb36bf77ed99221ae3ce1909f045d8f9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 11 Apr 2018 18:02:37 +0000 Subject: perf/core: Need CAP_SYS_ADMIN to create k/uprobe with 
perf_event_open() Non-root users cannot create kprobes or uprobes through the text-based interface (kprobe_events, uprobe_events), so they should not be able to create probes via perf_event_open() either. Reported-by: Vince Weaver Signed-off-by: Song Liu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Link: http://lkml.kernel.org/r/C0B2EFB5-C403-4BDB-9046-C14B3EE66999@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7af82827373..2d5fe26551f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8400,6 +8400,10 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ @@ -8437,6 +8441,10 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ -- cgit v1.2.3-71-gd317 From 60bb83b81169820c691fbfa33a6a4aef32aa4b0b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Apr 2018 15:35:13 -0700 Subject: resource: fix integer overflow at reallocation We've got a bug report indicating a kernel panic at boot on an x86-32 system, and it turned out to be caused by an invalid PCI resource assigned after reallocation. __find_resource() first aligns the resource start address and resets the end address to start+size-1 accordingly, then checks whether it's contained. Here the end address may overflow the integer, yet resource_contains() still returns true because the function validates only the start and end addresses. So this ends up returning an invalid resource (start > end). There was already an attempt to cover such a problem in commit 47ea91b4052d ("Resource: fix wrong resource window calculation"), but this case was overlooked. This patch adds a validity check of the newly calculated resource to avoid the integer overflow problem.
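A self-contained illustration of the wrap-around with made-up values, assuming a 32-bit resource_size_t; this is exactly the condition the added alloc.start <= alloc.end check rejects:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t start = 0xfffff000u;		/* aligned candidate start */
	uint32_t size  = 0x00002000u;		/* requested resource size */
	uint32_t end   = start + size - 1u;	/* wraps around to 0x00000fff */

	if (start > end)
		puts("invalid resource (start > end): must be rejected");
	return 0;
}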
Bugzilla: http://bugzilla.opensuse.org/show_bug.cgi?id=1086739 Link: http://lkml.kernel.org/r/s5hpo37d5l8.wl-tiwai@suse.de Fixes: 23c570a67448 ("resource: ability to resize an allocated resource") Signed-off-by: Takashi Iwai Reported-by: Michael Henders Tested-by: Michael Henders Reviewed-by: Andrew Morton Cc: Ram Pai Cc: Bjorn Helgaas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e270b5048988..2af6c03858b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; -- cgit v1.2.3-71-gd317 From 1cbf29da3628b661379acba7b08a07ef1e5da3b5 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 13 Apr 2018 15:35:34 -0700 Subject: kexec: export PG_swapbacked to VMCOREINFO Since commit 6326fec1122c ("mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked"), PG_swapcache is an alias for PG_owner_priv_1, which may also be used for other purposes. To know whether the bit indeed has the PG_swapcache meaning, it is necessary to check PG_swapbacked, hence this bit must be exported. Link: http://lkml.kernel.org/r/20180410161345.142e142d@ezekiel.suse.cz Signed-off-by: Petr Tesarik Reviewed-by: Andrew Morton Cc: Dave Young Cc: Xunlei Pang Cc: Baoquan He Cc: Hari Bathini Cc: "Kirill A. Shutemov" Cc: "Marc-André Lureau" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a93590cdd9e1..f7674d676889 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -454,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); -- cgit v1.2.3-71-gd317 From b799a09f639beeda105fe8a9ab440d80fdabd3b3 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:45 -0700 Subject: kexec_file: make use of purgatory optional Patch series "kexec_file, x86, powerpc: refactoring for other architectures", v2. This is a preparatory patchset for adding kexec_file support on arm64. It was originally included in an arm64 patch set[1], but Philipp is also working on their kexec_file support on s390[2] and some changes are now conflicting. So these common parts were extracted and put into a separate patch set for better integration. What's more, my original patch #4 was split into a few small chunks for easier review after Dave's comment. As such, the resulting code is basically identical to my original, and the only *visible* differences are: - renaming of _kexec_kernel_image_probe() and _kimage_file_post_load_cleanup() - a change to one of the argument types of prepare_elf64_headers() Those, unfortunately, require a couple of trivial changes to the rest (#1, #6 to #13) of my arm64 kexec_file patch set[1]. Patch #1 makes the use of purgatory optional, particularly useful for arm64.
Patch #2 commonalizes arch_kexec_kernel_{image_probe, image_load, verify_sig}() and arch_kimage_file_post_load_cleanup() across architectures. Patches #3-#7 are also intended to generalize prepare_elf64_headers(), along with exclude_mem_range(), so they can be re-used as widely as possible. [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/561182.html [2] http://lkml.iu.edu//hypermail/linux/kernel/1802.1/02596.html This patch (of 7): On arm64, the crash dump kernel's usable memory is protected by *unmapping* it from kernel virtual space, unlike other architectures where the region is just made read-only. It is highly unlikely that the region will be accidentally corrupted, and this observation rationalizes dropping the digest check code from purgatory as well. The resulting code is quite simple since it doesn't require the somewhat ugly re-linking/relocation stuff, i.e. arch_kexec_apply_relocations_add(). Please see: http://lists.infradead.org/pipermail/linux-arm-kernel/2017-December/545428.html All the purgatory does is shuffle arguments and jump into the new kernel, yet we would still need to reserve space for a hash value (purgatory_sha256_digest) that is never checked against. As such, it doesn't make sense to have trampoline code between the old and new kernels on arm64. This patch introduces a new configuration, ARCH_HAS_KEXEC_PURGATORY, and allows related code to be compiled in only if necessary. [takahiro.akashi@linaro.org: fix trivial screwup] Link: http://lkml.kernel.org/r/20180309093346.GF25863@linaro.org Link: http://lkml.kernel.org/r/20180306102303.9063-2-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 3 +++ arch/x86/Kconfig | 3 +++ kernel/kexec_file.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 73ce5dd07642..c32a181a7cbb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -552,6 +552,9 @@ config KEXEC_FILE for kernel and initramfs as opposed to a list of segments as is the case for the older kexec call. +config ARCH_HAS_KEXEC_PURGATORY + def_bool KEXEC_FILE + config RELOCATABLE bool "Build a relocatable kernel" depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE)) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d234cca296db..7fe107f5990b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2008,6 +2008,9 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. +config ARCH_HAS_KEXEC_PURGATORY + def_bool KEXEC_FILE + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e5bcd94c1efb..ab1dced677fd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -532,6 +532,9 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; + if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + return 0; + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; @@ -633,6 +636,7 @@ out: return ret; } +#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY /* Actually load purgatory.
Lot of code taken from kexec-tools */ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) @@ -1022,3 +1026,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ -- cgit v1.2.3-71-gd317 From 9ec4ecef0af7790551109283ca039a7c52de343c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:49 -0700 Subject: kexec_file,x86,powerpc: factor out kexec_file_ops functions As arch_kexec_kernel_image_{probe,load}(), arch_kimage_file_post_load_cleanup() and arch_kexec_kernel_verify_sig() are almost duplicated among architectures, they can be commonalized with an architecture-defined kexec_file_ops array. So let's factor them out. Link: http://lkml.kernel.org/r/20180306102303.9063-3-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Cc: Michael Ellerman Cc: Thiago Jung Bauermann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/kexec.h | 2 +- arch/powerpc/kernel/kexec_elf_64.c | 2 +- arch/powerpc/kernel/machine_kexec_file_64.c | 39 ++----------------- arch/x86/include/asm/kexec-bzimage64.h | 2 +- arch/x86/kernel/kexec-bzimage64.c | 2 +- arch/x86/kernel/machine_kexec_64.c | 45 +--------------------- include/linux/kexec.h | 13 +++---- kernel/kexec_file.c | 60 +++++++++++++++++++++++++++-- 8 files changed, 71 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index d8b1e8e7e035..4a585cba1787 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -95,7 +95,7 @@ static inline bool kdump_in_progress(void) } #ifdef CONFIG_KEXEC_FILE -extern struct kexec_file_ops kexec_elf64_ops; +extern const struct kexec_file_ops kexec_elf64_ops; #ifdef CONFIG_IMA_KEXEC #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c index 9a42309b091a..6c78c11c7faf 100644 --- a/arch/powerpc/kernel/kexec_elf_64.c +++ b/arch/powerpc/kernel/kexec_elf_64.c @@ -657,7 +657,7 @@ out: return ret ? ERR_PTR(ret) : fdt; } -struct kexec_file_ops kexec_elf64_ops = { +const struct kexec_file_ops kexec_elf64_ops = { .probe = elf64_probe, .load = elf64_load, }; diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 45e0b7d5f200..0bd23dc789a4 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -31,52 +31,19 @@ #define SLAVE_CODE_SIZE 256 -static struct kexec_file_ops *kexec_file_loaders[] = { +const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_elf64_ops, + NULL }; int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - int i, ret = -ENOEXEC; - struct kexec_file_ops *fops; - /* We don't support crash kernels yet. 
*/ if (image->type == KEXEC_TYPE_CRASH) return -EOPNOTSUPP; - for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { - fops = kexec_file_loaders[i]; - if (!fops || !fops->probe) - continue; - - ret = fops->probe(buf, buf_len); - if (!ret) { - image->fops = fops; - return ret; - } - } - - return ret; -} - -void *arch_kexec_kernel_image_load(struct kimage *image) -{ - if (!image->fops || !image->fops->load) - return ERR_PTR(-ENOEXEC); - - return image->fops->load(image, image->kernel_buf, - image->kernel_buf_len, image->initrd_buf, - image->initrd_buf_len, image->cmdline_buf, - image->cmdline_buf_len); -} - -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - if (!image->fops || !image->fops->cleanup) - return 0; - - return image->fops->cleanup(image->image_loader_data); + return kexec_image_probe_default(image, buf, buf_len); } /** diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h index 9f07cff43705..df89ee7d3e9e 100644 --- a/arch/x86/include/asm/kexec-bzimage64.h +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -2,6 +2,6 @@ #ifndef _ASM_KEXEC_BZIMAGE64_H #define _ASM_KEXEC_BZIMAGE64_H -extern struct kexec_file_ops kexec_bzImage64_ops; +extern const struct kexec_file_ops kexec_bzImage64_ops; #endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index fb095ba0c02f..705654776c0c 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -538,7 +538,7 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) } #endif -struct kexec_file_ops kexec_bzImage64_ops = { +const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 93bd4fb603d1..c51d2cf27d93 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -30,8 +30,9 @@ #include #ifdef CONFIG_KEXEC_FILE -static struct kexec_file_ops *kexec_file_loaders[] = { +const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, + NULL }; #endif @@ -364,27 +365,6 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - int i, ret = -ENOEXEC; - struct kexec_file_ops *fops; - - for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { - fops = kexec_file_loaders[i]; - if (!fops || !fops->probe) - continue; - - ret = fops->probe(buf, buf_len); - if (!ret) { - image->fops = fops; - return ret; - } - } - - return ret; -} - void *arch_kexec_kernel_image_load(struct kimage *image) { vfree(image->arch.elf_headers); @@ -399,27 +379,6 @@ void *arch_kexec_kernel_image_load(struct kimage *image) image->cmdline_buf_len); } -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - if (!image->fops || !image->fops->cleanup) - return 0; - - return image->fops->cleanup(image->image_loader_data); -} - -#ifdef CONFIG_KEXEC_VERIFY_SIG -int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, - unsigned long kernel_len) -{ - if (!image->fops || !image->fops->verify_sig) { - pr_debug("kernel loader does not support signature verification."); - return -EKEYREJECTED; - } - - return image->fops->verify_sig(kernel, kernel_len); -} -#endif - /* * Apply purgatory relocations. 
* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0ebcbeb21056..102c725421a1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -135,6 +135,11 @@ struct kexec_file_ops { #endif }; +extern const struct kexec_file_ops * const kexec_file_loaders[]; + +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len); + /** * struct kexec_buf - parameters for finding a place for a buffer in memory * @image: kexec image in which memory to search. @@ -209,7 +214,7 @@ struct kimage { unsigned long cmdline_buf_len; /* File operations provided by image loader */ - struct kexec_file_ops *fops; + const struct kexec_file_ops *fops; /* Image loader handling the kernel can store a pointer here */ void *image_loader_data; @@ -273,12 +278,6 @@ int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len); -void * __weak arch_kexec_kernel_image_load(struct kimage *image); -int __weak arch_kimage_file_post_load_cleanup(struct kimage *image); -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len); int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int relsec); int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ab1dced677fd..332c4fd12cb1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -28,28 +28,80 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* + * Currently this is the only default function that is exported as some + * architectures need it to do additional handlings. + * In the future, other default functions may be exported too if required. 
+ */ +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + const struct kexec_file_ops * const *fops; + int ret = -ENOEXEC; + + for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) { + ret = (*fops)->probe(buf, buf_len); + if (!ret) { + image->fops = *fops; + return ret; + } + } + + return ret; +} + /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - return -ENOEXEC; + return kexec_image_probe_default(image, buf, buf_len); +} + +static void *kexec_image_load_default(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); } void * __weak arch_kexec_kernel_image_load(struct kimage *image) { - return ERR_PTR(-ENOEXEC); + return kexec_image_load_default(image); +} + +static int kexec_image_post_load_cleanup_default(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); } int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { - return -EINVAL; + return kexec_image_post_load_cleanup_default(image); } #ifdef CONFIG_KEXEC_VERIFY_SIG +static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { - return -EKEYREJECTED; + return kexec_image_verify_sig_default(image, buf, buf_len); } #endif -- cgit v1.2.3-71-gd317 From babac4a84a88842bec477a5bdada1460f3bc374c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:36:06 -0700 Subject: kexec_file, x86: move re-factored code to generic side In the previous patches, commonly-used routines, exclude_mem_range() and prepare_elf64_headers(), were carved out. Now place them in kexec common code. A prefix "crash_" is given to each of their names to avoid possible name collisions. 
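Usage of the now-generic helper can be sketched as follows (the wrapper is hypothetical; the call matches the crash_exclude_mem_range() calls in the diff below, and the values in the comment are illustrative):

static int exclude_crashkernel(struct crash_mem *cmem)
{
	/* Removes [crashk_res.start, crashk_res.end] from cmem->ranges[];
	 * a range that strictly contains the excluded span is split:
	 *   before: { [0x1000, 0x8fff] }, exclude [0x3000, 0x3fff]
	 *   after:  { [0x1000, 0x2fff], [0x4000, 0x8fff] }
	 */
	return crash_exclude_mem_range(cmem, crashk_res.start,
				       crashk_res.end);
}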
Link: http://lkml.kernel.org/r/20180306102303.9063-8-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/crash.c | 195 ++---------------------------------------------- include/linux/kexec.h | 19 +++++ kernel/kexec_file.c | 175 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 229b8ecf8428..f631a3f15587 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,19 +38,6 @@ #include #include -/* Alignment required for elf header segment */ -#define ELF_CORE_HEADER_ALIGN 4096 - -struct crash_mem_range { - u64 start, end; -}; - -struct crash_mem { - unsigned int max_nr_ranges; - unsigned int nr_ranges; - struct crash_mem_range ranges[0]; -}; - /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { struct boot_params *params; @@ -227,77 +214,6 @@ static struct crash_mem *fill_up_crash_elf_data(void) return cmem; } -static int exclude_mem_range(struct crash_mem *mem, - unsigned long long mstart, unsigned long long mend) -{ - int i, j; - unsigned long long start, end; - struct crash_mem_range temp_range = {0, 0}; - - for (i = 0; i < mem->nr_ranges; i++) { - start = mem->ranges[i].start; - end = mem->ranges[i].end; - - if (mstart > end || mend < start) - continue; - - /* Truncate any area outside of range */ - if (mstart < start) - mstart = start; - if (mend > end) - mend = end; - - /* Found completely overlapping range */ - if (mstart == start && mend == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - } - mem->nr_ranges--; - return 0; - } - - if (mstart > start && mend < end) { - /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; - temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; - else - mem->ranges[i].start = mend + 1; - break; - } - - /* If a split happend, add the split to array */ - if (!temp_range.end) - return 0; - - /* Split happened */ - if (i == mem->max_nr_ranges - 1) - return -ENOMEM; - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; - } - - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; - return 0; -} - /* * Look for any unwanted ranges between mstart, mend and remove them. 
This * might lead to split and split ranges are put in cmem->ranges[] array @@ -307,12 +223,13 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) int ret = 0; /* Exclude crashkernel region */ - ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; if (crashk_low_res.end) { - ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, + crashk_low_res.end); if (ret) return ret; } @@ -331,105 +248,6 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) return 0; } -static int prepare_elf64_headers(struct crash_mem *cmem, bool kernel_map, - void **addr, unsigned long *sz) -{ - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; - unsigned char *buf; - unsigned int cpu, i; - unsigned long long notes_addr; - unsigned long mstart, mend; - - /* extra phdr for vmcoreinfo elf note */ - nr_phdr = nr_cpus + 1; - nr_phdr += cmem->nr_ranges; - - /* - * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping - * area on x86_64 (ffffffff80000000 - ffffffffa0000000). - * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel - * text virtual addresses and other will have __va(physical) addresses. - */ - - nr_phdr++; - elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); - elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); - - buf = vzalloc(elf_sz); - if (!buf) - return -ENOMEM; - - ehdr = (Elf64_Ehdr *)buf; - phdr = (Elf64_Phdr *)(ehdr + 1); - memcpy(ehdr->e_ident, ELFMAG, SELFMAG); - ehdr->e_ident[EI_CLASS] = ELFCLASS64; - ehdr->e_ident[EI_DATA] = ELFDATA2LSB; - ehdr->e_ident[EI_VERSION] = EV_CURRENT; - ehdr->e_ident[EI_OSABI] = ELF_OSABI; - memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); - ehdr->e_type = ET_CORE; - ehdr->e_machine = ELF_ARCH; - ehdr->e_version = EV_CURRENT; - ehdr->e_phoff = sizeof(Elf64_Ehdr); - ehdr->e_ehsize = sizeof(Elf64_Ehdr); - ehdr->e_phentsize = sizeof(Elf64_Phdr); - - /* Prepare one phdr of type PT_NOTE for each present cpu */ - for_each_present_cpu(cpu) { - phdr->p_type = PT_NOTE; - notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); - phdr->p_offset = phdr->p_paddr = notes_addr; - phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); - (ehdr->e_phnum)++; - phdr++; - } - - /* Prepare one PT_NOTE header for vmcoreinfo */ - phdr->p_type = PT_NOTE; - phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; - (ehdr->e_phnum)++; - phdr++; - - /* Prepare PT_LOAD type program header for kernel text region */ - if (kernel_map) { - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; - phdr->p_filesz = phdr->p_memsz = _end - _text; - phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); - ehdr->e_phnum++; - phdr++; - } - - /* Go through all the ranges in cmem->ranges[] and prepare phdr */ - for (i = 0; i < cmem->nr_ranges; i++) { - mstart = cmem->ranges[i].start; - mend = cmem->ranges[i].end; - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mstart; - - phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); - phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; - phdr->p_align = 0; - ehdr->e_phnum++; - phdr++; - pr_debug("Crash PT_LOAD elf header. 
phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); - } - - *addr = buf; - *sz = elf_sz; - return 0; -} - /* Prepare elf headers. Return addr and size */ static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) @@ -454,7 +272,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr, goto out; /* By default prepare 64bit headers */ - ret = prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); + ret = crash_prepare_elf64_headers(cmem, + IS_ENABLED(CONFIG_X86_64), addr, sz); if (ret) goto out; @@ -518,14 +337,14 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude Backup region */ start = image->arch.backup_load_addr; end = start + image->arch.backup_src_sz - 1; - ret = exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); if (ret) return ret; /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; - return exclude_mem_range(cmem, start, end); + return crash_exclude_mem_range(cmem, start, end); } /* Prepare memory map for crash dump kernel */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 102c725421a1..68865fd51aad 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -168,6 +168,25 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); + +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int max_nr_ranges; + unsigned int nr_ranges; + struct crash_mem_range ranges[0]; +}; + +extern int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, + unsigned long long mend); +extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz); #endif /* CONFIG_KEXEC_FILE */ struct kimage { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 332c4fd12cb1..b06b1fac5252 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -22,6 +22,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include "kexec_internal.h" @@ -1079,3 +1084,173 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } #endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + 
mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + phdr++; + 
pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + *addr = buf; + *sz = elf_sz; + return 0; +} -- cgit v1.2.3-71-gd317 From d2b8178ca7324a21495cb71049b4e4a041ab5942 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:13 -0700 Subject: kernel/kexec_file.c: remove checks in kexec_purgatory_load Before the purgatory is loaded several checks are done whether the ELF file in kexec_purgatory is valid or not. These checks are incomplete. For example they don't check for the total size of the sections defined in the section header table or if the entry point actually points into the purgatory. On the other hand the purgatory, although an ELF file on its own, is part of the kernel. Thus not trusting the purgatory means not trusting the kernel build itself. So remove all validity checks on the purgatory and just trust the kernel build. Link: http://lkml.kernel.org/r/20180321112751.22196-3-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b06b1fac5252..81ba4f782486 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,22 +941,8 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) return ret; -- cgit v1.2.3-71-gd317 From 65c225d3280542f3ea145e052215ce0538f6bb69 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:17 -0700 Subject: kernel/kexec_file.c: make purgatory_info->ehdr const The kexec_purgatory buffer is read-only. Thus all pointers into kexec_purgatory are read-only, too. Point this out by explicitly marking purgatory_info->ehdr as 'const' and update the comments in purgatory_info. 
Link: http://lkml.kernel.org/r/20180321112751.22196-4-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 17 +++++++++++------ kernel/kexec_file.c | 4 ++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 08b8b9d00f97..8c5819d1a808 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -99,14 +99,19 @@ struct compat_kexec_segment { #ifdef CONFIG_KEXEC_FILE struct purgatory_info { - /* Pointer to elf header of read only purgatory */ - Elf_Ehdr *ehdr; - - /* Pointer to purgatory sechdrs which are modifiable */ + /* + * Pointer to elf header at the beginning of kexec_purgatory. + * Note: kexec_purgatory is read only + */ + const Elf_Ehdr *ehdr; + /* + * Temporary, modifiable buffer for sechdrs used for relocation. + * This memory can be freed post image load. + */ Elf_Shdr *sechdrs; /* - * Temporary buffer location where purgatory is loaded and relocated - * This memory can be freed post image load + * Temporary, modifiable buffer for stripped purgatory used for + * relocation. This memory can be freed post image load. */ void *purgatory_buf; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 81ba4f782486..12cf9c9ff0bc 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,7 +941,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) @@ -965,9 +965,9 @@ out: static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, const char *name) { + const Elf_Ehdr *ehdr; Elf_Sym *syms; Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; int i, k; const char *strtab; -- cgit v1.2.3-71-gd317 From 961d921a1b967f76e13f9e11f2b0c2bcb5741f10 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:21 -0700 Subject: kernel/kexec_file.c: search symbols in read-only kexec_purgatory The stripped purgatory does not contain a symtab. So when looking for symbols this is done in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. Link: http://lkml.kernel.org/r/20180321112751.22196-5-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 12cf9c9ff0bc..9bd1ec1dd875 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -962,20 +962,27 @@ out: return ret; } -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. 
+ */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) { + const Elf_Shdr *sechdrs; const Elf_Ehdr *ehdr; - Elf_Sym *syms; - Elf_Shdr *sechdrs; - int i, k; + const Elf_Sym *syms; const char *strtab; + int i, k; - if (!pi->sechdrs || !pi->ehdr) + if (!pi->ehdr) return NULL; - sechdrs = pi->sechdrs; ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; for (i = 0; i < ehdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_SYMTAB) @@ -984,8 +991,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, if (sechdrs[i].sh_link >= ehdr->e_shnum) /* Invalid strtab section number */ continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; /* Go through symbols for a match */ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { @@ -1013,7 +1020,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; + const Elf_Sym *sym; Elf_Shdr *sechdr; sym = kexec_purgatory_find_symbol(pi, name); @@ -1036,9 +1043,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value) { - Elf_Sym *sym; - Elf_Shdr *sechdrs; struct purgatory_info *pi = &image->purgatory_info; + const Elf_Sym *sym; + Elf_Shdr *sec; char *sym_buf; sym = kexec_purgatory_find_symbol(pi, name); @@ -1051,16 +1058,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sechdrs = pi->sechdrs; + sec = pi->sechdrs + sym->st_shndx; - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sec->sh_type == SHT_NOBITS) { pr_err("symbol %s is in a bss section. Cannot %s\n", name, get_value ? "get" : "set"); return -EINVAL; } - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; + sym_buf = (char *)sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3-71-gd317 From 8aec395b8478310521031157ef5d44ef19c2c581 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:24 -0700 Subject: kernel/kexec_file.c: use read-only sections in arch_kexec_apply_relocations* When the relocations are applied to the purgatory only the section the relocations are applied to is writable. The other sections, i.e. the symtab and .rel/.rela, are in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. While at it also change the signatures of arch_kexec_apply_relocations* to take section pointers instead of just the index of the relocation section. This removes the second lookup and sanity check of the sections in arch code. 
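A toy sketch of the two calling conventions (invented toy_shdr type, not the real Elf_Shdr) shows why passing resolved pointers removes the duplicated lookup and sanity check in the arch hooks:

#include <stdio.h>

/* Toy stand-in for Elf_Shdr with just the fields the sketch needs. */
struct toy_shdr { unsigned int sh_type, sh_info, sh_link; };

/* Old convention: the arch hook gets an index and has to repeat the
 * dereferencing and bounds checking the generic caller already did. */
static int apply_by_index(const struct toy_shdr *sechdrs, unsigned int shnum,
			  unsigned int relsec)
{
	if (sechdrs[relsec].sh_info >= shnum || sechdrs[relsec].sh_link >= shnum)
		return -1;
	printf("old: section=%u symtab=%u (looked up again)\n",
	       sechdrs[relsec].sh_info, sechdrs[relsec].sh_link);
	return 0;
}

/* New convention: the generic loop resolves and validates the
 * pointers once; the arch hook just uses them. */
static int apply_by_pointer(const struct toy_shdr *section,
			    const struct toy_shdr *relsec,
			    const struct toy_shdr *symtab)
{
	(void)section; (void)relsec; (void)symtab;
	printf("new: pointers handed in, no second lookup\n");
	return 0;
}

int main(void)
{
	struct toy_shdr sechdrs[3] = { {0, 0, 0}, {4, 0, 2}, {2, 0, 0} };

	apply_by_index(sechdrs, 3, 1);
	apply_by_pointer(&sechdrs[sechdrs[1].sh_info], &sechdrs[1],
			 &sechdrs[sechdrs[1].sh_link]);
	return 0;
}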
Link: http://lkml.kernel.org/r/20180321112751.22196-6-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 56 ++++++++++++--------------------- include/linux/kexec.h | 13 +++++--- kernel/kexec_file.c | 63 +++++++++++++++++++++++++------------- 3 files changed, 71 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c51d2cf27d93..63dea30c8e02 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -382,52 +382,36 @@ void *arch_kexec_kernel_image_load(struct kimage *image) /* * Apply purgatory relocations. * - * ehdr: Pointer to elf headers - * sechdrs: Pointer to section headers. - * relsec: section index of SHT_RELA section. + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtabsec: Corresponding symtab. * * TODO: Some of the code belongs to generic code. Move that in kexec.c. */ -int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, - Elf64_Shdr *sechdrs, unsigned int relsec) +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, const Elf_Shdr *relsec, + const Elf_Shdr *symtabsec) { unsigned int i; Elf64_Rela *rel; Elf64_Sym *sym; void *location; - Elf64_Shdr *section, *symtabsec; unsigned long address, sec_base, value; const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; - /* - * ->sh_offset has been modified to keep the pointer to section - * contents in memory - */ - rel = (void *)sechdrs[relsec].sh_offset; - - /* Section to which relocations apply */ - section = &sechdrs[sechdrs[relsec].sh_info]; - - pr_debug("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - - /* Associated symbol table */ - symtabsec = &sechdrs[sechdrs[relsec].sh_link]; - - /* String table */ - if (symtabsec->sh_link >= ehdr->e_shnum) { - /* Invalid strtab section number */ - pr_err("Invalid string table section index %d\n", - symtabsec->sh_link); - return -ENOEXEC; - } + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + rel = (void *)pi->ehdr + relsec->sh_offset; - /* section header string table */ - shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + pr_debug("Applying relocate section %s to %u\n", + shstrtab + relsec->sh_name, relsec->sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { /* * rel[i].r_offset contains byte offset from beginning @@ -450,8 +434,8 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get * these respectively. 
*/ - sym = (Elf64_Sym *)symtabsec->sh_offset + - ELF64_R_SYM(rel[i].r_info); + sym = (void *)pi->ehdr + symtabsec->sh_offset; + sym += ELF64_R_SYM(rel[i].r_info); if (sym->st_name) name = strtab + sym->st_name; @@ -474,12 +458,12 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, if (sym->st_shndx == SHN_ABS) sec_base = 0; - else if (sym->st_shndx >= ehdr->e_shnum) { + else if (sym->st_shndx >= pi->ehdr->e_shnum) { pr_err("Invalid section %d for symbol %s\n", sym->st_shndx, name); return -ENOEXEC; } else - sec_base = sechdrs[sym->st_shndx].sh_addr; + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; value = sym->st_value; value += sec_base; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8c5819d1a808..0e389b9b7722 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -171,6 +171,15 @@ struct kexec_buf { bool top_down; }; +int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +int __weak arch_kexec_apply_relocations(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); + int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); @@ -304,10 +313,6 @@ int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); -int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, - Elf_Shdr *sechdrs, unsigned int relsec); -int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec); void arch_kexec_protect_crashkres(void); void arch_kexec_unprotect_crashkres(void); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9bd1ec1dd875..5c70f7f2bae3 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -110,19 +110,35 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, } #endif -/* Apply relocations of type RELA */ +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } -/* Apply relocations of type REL */ +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. 
+ */ +int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; @@ -879,14 +895,19 @@ static int kexec_apply_relocations(struct kimage *image) { int i, ret; struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; + const Elf_Shdr *sechdrs; + + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - /* Apply relocations */ for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; + const Elf_Shdr *relsec; + const Elf_Shdr *symtab; + Elf_Shdr *section; + + relsec = sechdrs + i; - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) + if (relsec->sh_type != SHT_RELA && + relsec->sh_type != SHT_REL) continue; /* @@ -895,12 +916,12 @@ static int kexec_apply_relocations(struct kimage *image) * symbol table. And ->sh_info contains section header * index of section to which relocations apply. */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) + if (relsec->sh_info >= pi->ehdr->e_shnum || + relsec->sh_link >= pi->ehdr->e_shnum) return -ENOEXEC; - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; + section = pi->sechdrs + relsec->sh_info; + symtab = sechdrs + relsec->sh_link; if (!(section->sh_flags & SHF_ALLOC)) continue; @@ -917,12 +938,12 @@ static int kexec_apply_relocations(struct kimage *image) * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); + if (relsec->sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi, section, + relsec, symtab); + else if (relsec->sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi, section, + relsec, symtab); if (ret) return ret; } -- cgit v1.2.3-71-gd317 From 930457057abe4e6d57433dea75e97e0e39fd0ab6 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:28 -0700 Subject: kernel/kexec_file.c: split up __kexec_load_purgatory When inspecting __kexec_load_purgatory you find that it has two tasks: 1) setting up the kexec_buffer for the new kernel and 2) setting up pi->sechdrs for the final load address. The two tasks are independent of each other. To improve readability split up __kexec_load_purgatory into two functions, one for each task, and call them directly from kexec_load_purgatory. Link: http://lkml.kernel.org/r/20180321112751.22196-7-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 200 +++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c70f7f2bae3..878b97bd3067 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -710,39 +710,97 @@ out: } #ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY -/* Actually load purgatory. 
Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) +/* + * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer to setup. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, + struct kexec_buf *kbuf) { - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, bss_align, bss_sz, bss_pad; - unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; + const Elf_Shdr *sechdrs; + unsigned long bss_align; + unsigned long bss_sz; + unsigned long align; + int i, ret; - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + bss_align = 1; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (kbuf->buf_align < align) + kbuf->buf_align = align; + kbuf->bufsz = ALIGN(kbuf->bufsz, align); + kbuf->bufsz += sechdrs[i].sh_size; + } else { + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align); + kbuf->memsz = kbuf->bufsz + bss_sz; + if (kbuf->buf_align < bss_align) + kbuf->buf_align = bss_align; + + kbuf->buffer = vzalloc(kbuf->bufsz); + if (!kbuf->buffer) + return -ENOMEM; + pi->purgatory_buf = kbuf->buffer; + + ret = kexec_add_buffer(kbuf); + if (ret) + goto out; + pi->purgatory_load_addr = kbuf->mem; + + return 0; +out: + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + return ret; +} + +/* + * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer prepared to store purgatory. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, + struct kexec_buf *kbuf) +{ + unsigned long curr_load_addr; + unsigned long load_addr; + unsigned long bss_addr; + unsigned long offset; + unsigned char *buf_addr; + unsigned char *src; + Elf_Shdr *sechdrs; + int entry_sidx = -1; + int i; - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, + pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + pi->sechdrs = sechdrs; /* * We seem to have multiple copies of sections. 
First copy is which @@ -770,7 +828,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, * Identify entry point section and make entry relative to section * start. */ - entry = pi->ehdr->e_entry; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -783,63 +841,19 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > pi->ehdr->e_entry)) { entry_sidx = i; - entry -= sechdrs[i].sh_addr; + kbuf->image->start -= sechdrs[i].sh_addr; break; } } - /* Determine how much memory is needed to load relocatable object. */ - bss_align = 1; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (kbuf.buf_align < align) - kbuf.buf_align = align; - kbuf.bufsz = ALIGN(kbuf.bufsz, align); - kbuf.bufsz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (kbuf.bufsz & (bss_align - 1)) - bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - - kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - kbuf.buffer = vzalloc(kbuf.bufsz); - if (!kbuf.buffer) { - ret = -ENOMEM; - goto out; - } - - if (kbuf.buf_align < bss_align) - kbuf.buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - pi->purgatory_load_addr = kbuf.mem; - /* Load SHF_ALLOC sections */ - buf_addr = kbuf.buffer; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + kbuf.bufsz + bss_pad; + buf_addr = kbuf->buffer; + load_addr = curr_load_addr = kbuf->mem; + bss_addr = load_addr + kbuf->bufsz; for (i = 0; i < pi->ehdr->e_shnum; i++) { + unsigned long align; + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -871,24 +885,9 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Update entry point based on load address of text section */ if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; + kbuf->image->start += sechdrs[entry_sidx].sh_addr; - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. 
- */ - pi->purgatory_buf = kbuf.buffer; - return ret; -out: - vfree(sechdrs); - vfree(kbuf.buffer); - return ret; + return 0; } static int kexec_apply_relocations(struct kimage *image) @@ -958,16 +957,23 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, { struct purgatory_info *pi = &image->purgatory_info; int ret; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = __kexec_load_purgatory(image, min, max, top_down); + ret = kexec_purgatory_setup_kbuf(pi, &kbuf); if (ret) return ret; + ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + if (ret) + goto out_free_kbuf; + ret = kexec_apply_relocations(image); if (ret) goto out; @@ -977,7 +983,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, out: vfree(pi->sechdrs); pi->sechdrs = NULL; - +out_free_kbuf: vfree(pi->purgatory_buf); pi->purgatory_buf = NULL; return ret; -- cgit v1.2.3-71-gd317 From f1b1cca39650c9c1dbe26140946a518953f66771 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:32 -0700 Subject: kernel/kexec_file.c: remove unneeded for-loop in kexec_purgatory_setup_sechdrs To update the entry point there is an extra loop over all section headers although this can be done in the main loop. So move it there and eliminate the extra loop and variable to store the 'entry section index'. Also, in the main loop, move the usual case, i.e. non-bss section, out of the extra if-block. Link: http://lkml.kernel.org/r/20180321112751.22196-8-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 76 +++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 878b97bd3067..6f0ec9f2e5c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -792,7 +792,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned char *buf_addr; unsigned char *src; Elf_Shdr *sechdrs; - int entry_sidx = -1; int i; sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); @@ -824,32 +823,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* - * Identify entry point section and make entry relative to section - * start. 
- */ - kbuf->image->start = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - kbuf->image->start -= sechdrs[i].sh_addr; - break; - } - } - /* Load SHF_ALLOC sections */ buf_addr = kbuf->buffer; load_addr = curr_load_addr = kbuf->mem; bss_addr = load_addr + kbuf->bufsz; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; @@ -858,34 +836,40 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { + + if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; bss_addr += sechdrs[i].sh_size; + continue; + } + + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *)sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + if (sechdrs[i].sh_flags & SHF_EXECINSTR && + pi->ehdr->e_entry >= sechdrs[i].sh_addr && + pi->ehdr->e_entry < (sechdrs[i].sh_addr + + sechdrs[i].sh_size)) { + kbuf->image->start -= sechdrs[i].sh_addr; + kbuf->image->start += curr_load_addr; } - } - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - kbuf->image->start += sechdrs[entry_sidx].sh_addr; + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } return 0; } -- cgit v1.2.3-71-gd317 From 620f697cc27a6d9b09268f47cd13620488ec67af Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:35 -0700 Subject: kernel/kexec_file.c: remove unneeded variables in kexec_purgatory_setup_sechdrs The main loop currently uses quite a lot of variables to update the section headers. Some of them are unnecessary. So clean them up a little. 
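The resulting loop shape is roughly the following userspace sketch (toy data, not the kernel function): a single running offset into the staging buffer replaces the old buf_addr/load_addr/curr_load_addr trio, and every address of interest is derived as a base plus that offset.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	unsigned char staging[64] = { 0 };		/* purgatory_buf stand-in */
	const char *payload[] = { "text", "rodata" };	/* SHF_ALLOC section data */
	size_t size[] = { 5, 7 }, align[] = { 8, 16 };
	uintptr_t mem = 0x100000;			/* pretend kbuf->mem */
	size_t offset = 0;

	for (int i = 0; i < 2; i++) {
		offset = ALIGN(offset, align[i]);
		memcpy(staging + offset, payload[i], size[i]);
		/* both addresses of interest derive from one offset */
		printf("section %d: staged at +%zu, runs at %#lx\n",
		       i, offset, (unsigned long)(mem + offset));
		offset += size[i];
	}
	return 0;
}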
Link: http://lkml.kernel.org/r/20180321112751.22196-9-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6f0ec9f2e5c1..7b63de8a89b6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -785,12 +785,8 @@ out: static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, struct kexec_buf *kbuf) { - unsigned long curr_load_addr; - unsigned long load_addr; unsigned long bss_addr; unsigned long offset; - unsigned char *buf_addr; - unsigned char *src; Elf_Shdr *sechdrs; int i; @@ -823,20 +819,18 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* Load SHF_ALLOC sections */ - buf_addr = kbuf->buffer; - load_addr = curr_load_addr = kbuf->mem; - bss_addr = load_addr + kbuf->bufsz; + offset = 0; + bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; + void *src, *dst; if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; @@ -844,31 +838,27 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_size; continue; } - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *)sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - + offset = ALIGN(offset, align); if (sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size)) { kbuf->image->start -= sechdrs[i].sh_addr; - kbuf->image->start += curr_load_addr; + kbuf->image->start += kbuf->mem + offset; } - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; + src = (void *)sechdrs[i].sh_offset; + dst = pi->purgatory_buf + offset; + memcpy(dst, src, sechdrs[i].sh_size); + + sechdrs[i].sh_addr = kbuf->mem + offset; /* * This section got copied to temporary buffer. Update * ->sh_offset accordingly. */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; + sechdrs[i].sh_offset = (unsigned long)dst; + offset += sechdrs[i].sh_size; } return 0; -- cgit v1.2.3-71-gd317 From 8da0b724959ccd3f8435214ebdaf1aef548967bb Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:39 -0700 Subject: kernel/kexec_file.c: remove mis-use of sh_offset field during purgatory load The current code uses the sh_offset field in purgatory_info->sechdrs to store a pointer to the current load address of the section. Depending on whether the section will be loaded or not this is either a pointer into purgatory_info->purgatory_buf or kexec_purgatory. This is not only a violation of the ELF standard but also makes the code very hard to understand as you cannot tell if the memory you are using is read-only or not. Remove this misuse and store the offset of the section in purgatory_info->purgatory_buf in sh_offset. 
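After the change the field behaves as in this small userspace sketch (invented toy_shdr, not kernel code): sh_offset holds only an offset into the staging buffer, and both the location to patch during relocation and the final run-time address are derived from it on demand.

#include <stdio.h>

struct toy_shdr { unsigned long sh_addr, sh_offset; };

int main(void)
{
	unsigned char staging[32] = { 0 };	/* purgatory_buf stand-in */
	unsigned long mem = 0x200000;		/* final load address, kbuf->mem */
	struct toy_shdr s = { .sh_offset = 16 };

	s.sh_addr = mem + s.sh_offset;		/* where the section will run */

	/* Relocations patch the staging copy; sh_offset itself stays a
	 * plain offset, as the ELF standard intends. */
	unsigned char *loc = staging + s.sh_offset;
	*loc = 0x90;				/* e.g. patch in a NOP */

	printf("patched staging+%lu, will run at %#lx\n", s.sh_offset, s.sh_addr);
	return 0;
}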
Link: http://lkml.kernel.org/r/20180321112751.22196-10-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 10 ++++++---- kernel/kexec_file.c | 37 +++++++------------------------------ 2 files changed, 13 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 63dea30c8e02..a5e55d832d0a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -417,13 +417,15 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, * rel[i].r_offset contains byte offset from beginning * of section to the storage unit affected. * - * This is location to update (->sh_offset). This is temporary - * buffer where section is currently loaded. This will finally - * be loaded to a different address later, pointed to by + * This is location to update. This is temporary buffer + * where section is currently loaded. This will finally be + * loaded to a different address later, pointed to by * ->sh_addr. kexec takes care of moving it * (kexec_load_segment()). */ - location = (void *)(section->sh_offset + rel[i].r_offset); + location = pi->purgatory_buf; + location += section->sh_offset; + location += rel[i].r_offset; /* Final address of the location */ address = section->sh_addr + rel[i].r_offset; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 7b63de8a89b6..269116fd932c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -790,6 +790,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, Elf_Shdr *sechdrs; int i; + /* + * The section headers in kexec_purgatory are read-only. In order to + * have them modifiable make a temporary copy. + */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; @@ -797,28 +801,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); pi->sechdrs = sechdrs; - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - offset = 0; bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; @@ -847,17 +829,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, kbuf->image->start += kbuf->mem + offset; } - src = (void *)sechdrs[i].sh_offset; + src = (void *)pi->ehdr + sechdrs[i].sh_offset; dst = pi->purgatory_buf + offset; memcpy(dst, src, sechdrs[i].sh_size); sechdrs[i].sh_addr = kbuf->mem + offset; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. 
- */ - sechdrs[i].sh_offset = (unsigned long)dst; + sechdrs[i].sh_offset = offset; offset += sechdrs[i].sh_size; } @@ -1067,7 +1044,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sym_buf = (char *)sec->sh_offset + sym->st_value; + sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3-71-gd317 From 3be3f61d25e04ecf90d65d52fad632af5ba8805b Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:43 -0700 Subject: kernel/kexec_file.c: allow archs to set purgatory load address For s390 new kernels are loaded to fixed addresses in memory before they are booted. With the current code this is a problem as it assumes the kernel will be loaded to an 'arbitrary' address. In particular, kexec_locate_mem_hole searches for a large enough memory region and sets the load address (kexec_buffer->mem) to it. Luckily there is a simple workaround for this problem. By returning 1 in arch_kexec_walk_mem, kexec_locate_mem_hole is turned off. This allows the architecture to set kbuf->mem by hand. While the trick works fine for the kernel, it does not work for the purgatory, as here the architectures don't have access to its kexec_buffer. Give architectures access to the purgatory's kexec_buffer by changing kexec_load_purgatory to take a pointer to it. With this change architectures have access to the buffer and can edit it as they need. A nice side effect of this change is that we can get rid of the purgatory_info->purgatory_load_address field, as the information stored there can now be accessed directly from kbuf->mem. Link: http://lkml.kernel.org/r/20180321112751.22196-11-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/kexec_elf_64.c | 9 +++++---- arch/x86/kernel/kexec-bzimage64.c | 8 ++++---- include/linux/kexec.h | 17 ++++++----------- kernel/kexec_file.c | 29 ++++++++++++++++------------- 4 files changed, 31 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c index 6c78c11c7faf..ba4f18a43ee8 100644 --- a/arch/powerpc/kernel/kexec_elf_64.c +++ b/arch/powerpc/kernel/kexec_elf_64.c @@ -572,7 +572,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, { int ret; unsigned int fdt_size; - unsigned long kernel_load_addr, purgatory_load_addr; + unsigned long kernel_load_addr; unsigned long initrd_load_addr = 0, fdt_load_addr; void *fdt; const void *slave_code; struct elfhdr ehdr; struct elf_info elf_info; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ppc64_rma_size }; + struct kexec_buf pbuf = { .image = image, .buf_min = 0, + .buf_max = ppc64_rma_size, .top_down = true }; ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info); if (ret) @@ -591,14 +593,13 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr); - ret = kexec_load_purgatory(image, 0, ppc64_rma_size, true, - &purgatory_load_addr); + ret = kexec_load_purgatory(image, &pbuf); if (ret) { pr_err("Loading purgatory failed.\n"); goto out; } - pr_debug("Loaded purgatory at 0x%lx\n", 
purgatory_load_addr); + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); if (initrd != NULL) { kbuf.buffer = initrd; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 705654776c0c..3182908b7e6c 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -334,7 +334,6 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; - unsigned long purgatory_load_addr; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; @@ -342,6 +341,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, .top_down = true }; + struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR, + .buf_max = ULONG_MAX, .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -379,14 +380,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, * Load purgatory. For 64bit entry point, purgatory code can be * anywhere. */ - ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, - &purgatory_load_addr); + ret = kexec_load_purgatory(image, &pbuf); if (ret) { pr_err("Loading purgatory failed\n"); return ERR_PTR(ret); } - pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); /* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0e389b9b7722..9e4e638fb505 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -114,9 +114,6 @@ struct purgatory_info { * relocation. This memory can be freed post image load. 
*/ void *purgatory_buf; - - /* Address where purgatory is finally loaded and is executed from */ - unsigned long purgatory_load_addr; }; struct kimage; @@ -171,6 +168,12 @@ struct kexec_buf { bool top_down; }; +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, + bool get_value); +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); + int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, @@ -266,14 +269,6 @@ extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); -extern int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr); -extern int kexec_purgatory_get_set_symbol(struct kimage *image, - const char *name, void *buf, - unsigned int size, bool get_value); -extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, - const char *name); extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 269116fd932c..75d8e7cf040e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -730,8 +730,8 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, int i, ret; sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - bss_align = 1; - bss_sz = 0; + kbuf->buf_align = bss_align = 1; + kbuf->bufsz = bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -763,7 +763,6 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, ret = kexec_add_buffer(kbuf); if (ret) goto out; - pi->purgatory_load_addr = kbuf->mem; return 0; out: @@ -901,27 +900,32 @@ static int kexec_apply_relocations(struct kimage *image) return 0; } -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) +/* + * kexec_load_purgatory - Load and relocate the purgatory object. + * @image: Image to add the purgatory to. + * @kbuf: Memory parameters to use. + * + * Allocates the memory needed for image->purgatory_info.sechdrs and + * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible + * to free the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) { struct purgatory_info *pi = &image->purgatory_info; int ret; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = kexec_purgatory_setup_kbuf(pi, &kbuf); + ret = kexec_purgatory_setup_kbuf(pi, kbuf); if (ret) return ret; - ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + ret = kexec_purgatory_setup_sechdrs(pi, kbuf); if (ret) goto out_free_kbuf; @@ -929,7 +933,6 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (ret) goto out; - *load_addr = pi->purgatory_load_addr; return 0; out: vfree(pi->sechdrs); -- cgit v1.2.3-71-gd317
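For reference, a condensed caller-side sketch of the new interface, mirroring the x86 bzImage64_load hunk above (kernel-context fragment, not a standalone program; MIN_PURGATORY_ADDR is the x86 constant, other architectures pick their own bounds):

/* The caller now owns the kexec_buf, so an architecture that must pin
 * the load address can set pbuf.mem itself (e.g. with
 * arch_kexec_walk_mem returning 1 so kexec_locate_mem_hole is skipped). */
struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
			  .buf_max = ULONG_MAX, .top_down = true };
int ret = kexec_load_purgatory(image, &pbuf);

if (ret) {
	pr_err("Loading purgatory failed\n");
	return ERR_PTR(ret);
}
pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);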