From e12f03d7031a977356e3d7b75a68c2185ff8d155 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 6 Dec 2017 14:45:15 -0800
Subject: perf/core: Implement the 'perf_kprobe' PMU

A new PMU type, perf_kprobe is added. Based on attr from perf_event_open(),
perf_kprobe creates a kprobe (or kretprobe) for the perf_event. This
kprobe is private to this perf_event, and thus not added to global
lists, and not available in tracefs.

Two functions, create_local_trace_kprobe() and
destroy_local_trace_kprobe()  are added to created and destroy these
local trace_kprobe.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: <daniel@iogearbox.net>
Cc: <davem@davemloft.net>
Cc: <kernel-team@fb.com>
Cc: <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171206224518.3598254-6-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c            | 142 ++++++++++++++++++++++++++++++----------
 kernel/trace/trace_event_perf.c |  49 ++++++++++++++
 kernel/trace/trace_kprobe.c     |  91 ++++++++++++++++++++++---
 kernel/trace/trace_probe.h      |   7 ++
 4 files changed, 246 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d99fe3fdec8a..333735531968 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7992,9 +7992,77 @@ static struct pmu perf_tracepoint = {
 	.read		= perf_swevent_read,
 };
 
+#ifdef CONFIG_KPROBE_EVENTS
+/*
+ * Flags in config, used by dynamic PMU kprobe and uprobe
+ * The flags should match following PMU_FORMAT_ATTR().
+ *
+ * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
+ *                               if not set, create kprobe/uprobe
+ */
+enum perf_probe_config {
+	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
+};
+
+PMU_FORMAT_ATTR(retprobe, "config:0");
+
+static struct attribute *probe_attrs[] = {
+	&format_attr_retprobe.attr,
+	NULL,
+};
+
+static struct attribute_group probe_format_group = {
+	.name = "format",
+	.attrs = probe_attrs,
+};
+
+static const struct attribute_group *probe_attr_groups[] = {
+	&probe_format_group,
+	NULL,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event);
+static struct pmu perf_kprobe = {
+	.task_ctx_nr	= perf_sw_context,
+	.event_init	= perf_kprobe_event_init,
+	.add		= perf_trace_add,
+	.del		= perf_trace_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
+	.read		= perf_swevent_read,
+	.attr_groups	= probe_attr_groups,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event)
+{
+	int err;
+	bool is_retprobe;
+
+	if (event->attr.type != perf_kprobe.type)
+		return -ENOENT;
+	/*
+	 * no branch sampling for probe events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+	err = perf_kprobe_init(event, is_retprobe);
+	if (err)
+		return err;
+
+	event->destroy = perf_kprobe_destroy;
+
+	return 0;
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
 static inline void perf_tp_register(void)
 {
 	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+#ifdef CONFIG_KPROBE_EVENTS
+	perf_pmu_register(&perf_kprobe, "kprobe", -1);
+#endif
 }
 
 static void perf_event_free_filter(struct perf_event *event)
@@ -8071,13 +8139,28 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
 }
 #endif
 
+/*
+ * returns true if the event is a tracepoint, or a kprobe/upprobe created
+ * with perf_event_open()
+ */
+static inline bool perf_event_is_tracing(struct perf_event *event)
+{
+	if (event->pmu == &perf_tracepoint)
+		return true;
+#ifdef CONFIG_KPROBE_EVENTS
+	if (event->pmu == &perf_kprobe)
+		return true;
+#endif
+	return false;
+}
+
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
 	int ret;
 
-	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+	if (!perf_event_is_tracing(event))
 		return perf_event_set_bpf_handler(event, prog_fd);
 
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
@@ -8116,7 +8199,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
-	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+	if (!perf_event_is_tracing(event)) {
 		perf_event_free_bpf_handler(event);
 		return;
 	}
@@ -8535,47 +8618,36 @@ fail_clear_files:
 	return ret;
 }
 
-static int
-perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
-{
-	struct perf_event_context *ctx = event->ctx;
-	int ret;
-
-	/*
-	 * Beware, here be dragons!!
-	 *
-	 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
-	 * stuff does not actually need it. So temporarily drop ctx->mutex. As per
-	 * perf_event_ctx_lock() we already have a reference on ctx.
-	 *
-	 * This can result in event getting moved to a different ctx, but that
-	 * does not affect the tracepoint state.
-	 */
-	mutex_unlock(&ctx->mutex);
-	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-	mutex_lock(&ctx->mutex);
-
-	return ret;
-}
-
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
-	char *filter_str;
 	int ret = -EINVAL;
-
-	if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
-	    !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
-	    !has_addr_filter(event))
-		return -EINVAL;
+	char *filter_str;
 
 	filter_str = strndup_user(arg, PAGE_SIZE);
 	if (IS_ERR(filter_str))
 		return PTR_ERR(filter_str);
 
-	if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
-	    event->attr.type == PERF_TYPE_TRACEPOINT)
-		ret = perf_tracepoint_set_filter(event, filter_str);
-	else if (has_addr_filter(event))
+#ifdef CONFIG_EVENT_TRACING
+	if (perf_event_is_tracing(event)) {
+		struct perf_event_context *ctx = event->ctx;
+
+		/*
+		 * Beware, here be dragons!!
+		 *
+		 * the tracepoint muck will deadlock against ctx->mutex, but
+		 * the tracepoint stuff does not actually need it. So
+		 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
+		 * already have a reference on ctx.
+		 *
+		 * This can result in event getting moved to a different ctx,
+		 * but that does not affect the tracepoint state.
+		 */
+		mutex_unlock(&ctx->mutex);
+		ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+		mutex_lock(&ctx->mutex);
+	} else
+#endif
+	if (has_addr_filter(event))
 		ret = perf_event_set_addr_filter(event, filter_str);
 
 	kfree(filter_str);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 55d6dff37daf..779baade9114 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
+#include "trace_probe.h"
 
 static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
@@ -237,6 +238,54 @@ void perf_trace_destroy(struct perf_event *p_event)
 	mutex_unlock(&event_mutex);
 }
 
+#ifdef CONFIG_KPROBE_EVENTS
+int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+	int ret;
+	char *func = NULL;
+	struct trace_event_call *tp_event;
+
+	if (p_event->attr.kprobe_func) {
+		func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
+		if (!func)
+			return -ENOMEM;
+		ret = strncpy_from_user(
+			func, u64_to_user_ptr(p_event->attr.kprobe_func),
+			KSYM_NAME_LEN);
+		if (ret < 0)
+			goto out;
+
+		if (func[0] == '\0') {
+			kfree(func);
+			func = NULL;
+		}
+	}
+
+	tp_event = create_local_trace_kprobe(
+		func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
+		p_event->attr.probe_offset, is_retprobe);
+	if (IS_ERR(tp_event)) {
+		ret = PTR_ERR(tp_event);
+		goto out;
+	}
+
+	ret = perf_trace_event_init(tp_event, p_event);
+	if (ret)
+		destroy_local_trace_kprobe(tp_event);
+out:
+	kfree(func);
+	return ret;
+}
+
+void perf_kprobe_destroy(struct perf_event *p_event)
+{
+	perf_trace_event_close(p_event);
+	perf_trace_event_unreg(p_event);
+
+	destroy_local_trace_kprobe(p_event->tp_event);
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..246c786c851c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -438,6 +438,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 			disable_kprobe(&tk->rp.kp);
 		wait = 1;
 	}
+
+	/*
+	 * if tk is not added to any list, it must be a local trace_kprobe
+	 * created with perf_event_open. We don't need to wait for these
+	 * trace_kprobes
+	 */
+	if (list_empty(&tk->list))
+		wait = 0;
  out:
 	if (wait) {
 		/*
@@ -1313,12 +1321,9 @@ static struct trace_event_functions kprobe_funcs = {
 	.trace		= print_kprobe_event
 };
 
-static int register_kprobe_event(struct trace_kprobe *tk)
+static inline void init_trace_event_call(struct trace_kprobe *tk,
+					 struct trace_event_call *call)
 {
-	struct trace_event_call *call = &tk->tp.call;
-	int ret;
-
-	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	if (trace_kprobe_is_return(tk)) {
 		call->event.funcs = &kretprobe_funcs;
@@ -1327,6 +1332,19 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		call->event.funcs = &kprobe_funcs;
 		call->class->define_fields = kprobe_event_define_fields;
 	}
+
+	call->flags = TRACE_EVENT_FL_KPROBE;
+	call->class->reg = kprobe_register;
+	call->data = tk;
+}
+
+static int register_kprobe_event(struct trace_kprobe *tk)
+{
+	struct trace_event_call *call = &tk->tp.call;
+	int ret = 0;
+
+	init_trace_event_call(tk, call);
+
 	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
 		return -ENOMEM;
 	ret = register_trace_event(&call->event);
@@ -1334,9 +1352,6 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = TRACE_EVENT_FL_KPROBE;
-	call->class->reg = kprobe_register;
-	call->data = tk;
 	ret = trace_add_event_call(call);
 	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n",
@@ -1358,6 +1373,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
 	return ret;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+/* create a trace_kprobe, but don't add it to global lists */
+struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+			  bool is_return)
+{
+	struct trace_kprobe *tk;
+	int ret;
+	char *event;
+
+	/*
+	 * local trace_kprobes are not added to probe_list, so they are never
+	 * searched in find_trace_kprobe(). Therefore, there is no concern of
+	 * duplicated name here.
+	 */
+	event = func ? func : "DUMMY_EVENT";
+
+	tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func,
+				offs, 0 /* maxactive */, 0 /* nargs */,
+				is_return);
+
+	if (IS_ERR(tk)) {
+		pr_info("Failed to allocate trace_probe.(%d)\n",
+			(int)PTR_ERR(tk));
+		return ERR_CAST(tk);
+	}
+
+	init_trace_event_call(tk, &tk->tp.call);
+
+	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = __register_trace_kprobe(tk);
+	if (ret < 0)
+		goto error;
+
+	return &tk->tp.call;
+error:
+	free_trace_kprobe(tk);
+	return ERR_PTR(ret);
+}
+
+void destroy_local_trace_kprobe(struct trace_event_call *event_call)
+{
+	struct trace_kprobe *tk;
+
+	tk = container_of(event_call, struct trace_kprobe, tp.call);
+
+	if (trace_probe_is_enabled(&tk->tp)) {
+		WARN_ON(1);
+		return;
+	}
+
+	__unregister_trace_kprobe(tk);
+	free_trace_kprobe(tk);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
 /* Make a tracefs interface for controlling probe points */
 static __init int init_kprobe_trace(void)
 {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..e3d940a49dcd 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -404,3 +404,10 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
 }
 
 extern int set_print_fmt(struct trace_probe *tp, bool is_return);
+
+#ifdef CONFIG_PERF_EVENTS
+extern struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+			  bool is_return);
+extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
+#endif
-- 
cgit v1.2.3-71-gd317


From 33ea4b24277b06dbc55d7f5772a46f029600255e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 6 Dec 2017 14:45:16 -0800
Subject: perf/core: Implement the 'perf_uprobe' PMU

This patch adds perf_uprobe support with similar pattern as previous
patch (for kprobe).

Two functions, create_local_trace_uprobe() and
destroy_local_trace_uprobe(), are created so a uprobe can be created
and attached to the file descriptor created by perf_event_open().

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: <daniel@iogearbox.net>
Cc: <davem@davemloft.net>
Cc: <kernel-team@fb.com>
Cc: <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171206224518.3598254-7-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/trace_events.h    |  4 ++
 kernel/events/core.c            | 48 ++++++++++++++++++++++-
 kernel/trace/trace_event_perf.c | 53 +++++++++++++++++++++++++
 kernel/trace/trace_probe.h      |  4 ++
 kernel/trace/trace_uprobe.c     | 86 +++++++++++++++++++++++++++++++++++++----
 5 files changed, 186 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21c5d43a21af..0d9d6cb454b1 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -537,6 +537,10 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
 #endif
+#ifdef CONFIG_UPROBE_EVENTS
+extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
+extern void perf_uprobe_destroy(struct perf_event *event);
+#endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 333735531968..5a54630db86b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7992,7 +7992,7 @@ static struct pmu perf_tracepoint = {
 	.read		= perf_swevent_read,
 };
 
-#ifdef CONFIG_KPROBE_EVENTS
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
 /*
  * Flags in config, used by dynamic PMU kprobe and uprobe
  * The flags should match following PMU_FORMAT_ATTR().
@@ -8020,7 +8020,9 @@ static const struct attribute_group *probe_attr_groups[] = {
 	&probe_format_group,
 	NULL,
 };
+#endif
 
+#ifdef CONFIG_KPROBE_EVENTS
 static int perf_kprobe_event_init(struct perf_event *event);
 static struct pmu perf_kprobe = {
 	.task_ctx_nr	= perf_sw_context,
@@ -8057,12 +8059,52 @@ static int perf_kprobe_event_init(struct perf_event *event)
 }
 #endif /* CONFIG_KPROBE_EVENTS */
 
+#ifdef CONFIG_UPROBE_EVENTS
+static int perf_uprobe_event_init(struct perf_event *event);
+static struct pmu perf_uprobe = {
+	.task_ctx_nr	= perf_sw_context,
+	.event_init	= perf_uprobe_event_init,
+	.add		= perf_trace_add,
+	.del		= perf_trace_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
+	.read		= perf_swevent_read,
+	.attr_groups	= probe_attr_groups,
+};
+
+static int perf_uprobe_event_init(struct perf_event *event)
+{
+	int err;
+	bool is_retprobe;
+
+	if (event->attr.type != perf_uprobe.type)
+		return -ENOENT;
+	/*
+	 * no branch sampling for probe events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+	err = perf_uprobe_init(event, is_retprobe);
+	if (err)
+		return err;
+
+	event->destroy = perf_uprobe_destroy;
+
+	return 0;
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
 static inline void perf_tp_register(void)
 {
 	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 #ifdef CONFIG_KPROBE_EVENTS
 	perf_pmu_register(&perf_kprobe, "kprobe", -1);
 #endif
+#ifdef CONFIG_UPROBE_EVENTS
+	perf_pmu_register(&perf_uprobe, "uprobe", -1);
+#endif
 }
 
 static void perf_event_free_filter(struct perf_event *event)
@@ -8150,6 +8192,10 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
 #ifdef CONFIG_KPROBE_EVENTS
 	if (event->pmu == &perf_kprobe)
 		return true;
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+	if (event->pmu == &perf_uprobe)
+		return true;
 #endif
 	return false;
 }
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 779baade9114..2c416509b834 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -286,6 +286,59 @@ void perf_kprobe_destroy(struct perf_event *p_event)
 }
 #endif /* CONFIG_KPROBE_EVENTS */
 
+#ifdef CONFIG_UPROBE_EVENTS
+int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+	int ret;
+	char *path = NULL;
+	struct trace_event_call *tp_event;
+
+	if (!p_event->attr.uprobe_path)
+		return -EINVAL;
+	path = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+	ret = strncpy_from_user(
+		path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
+	if (ret < 0)
+		goto out;
+	if (path[0] == '\0') {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	tp_event = create_local_trace_uprobe(
+		path, p_event->attr.probe_offset, is_retprobe);
+	if (IS_ERR(tp_event)) {
+		ret = PTR_ERR(tp_event);
+		goto out;
+	}
+
+	/*
+	 * local trace_uprobe need to hold event_mutex to call
+	 * uprobe_buffer_enable() and uprobe_buffer_disable().
+	 * event_mutex is not required for local trace_kprobes.
+	 */
+	mutex_lock(&event_mutex);
+	ret = perf_trace_event_init(tp_event, p_event);
+	if (ret)
+		destroy_local_trace_uprobe(tp_event);
+	mutex_unlock(&event_mutex);
+out:
+	kfree(path);
+	return ret;
+}
+
+void perf_uprobe_destroy(struct perf_event *p_event)
+{
+	mutex_lock(&event_mutex);
+	perf_trace_event_close(p_event);
+	perf_trace_event_unreg(p_event);
+	mutex_unlock(&event_mutex);
+	destroy_local_trace_uprobe(p_event->tp_event);
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e3d940a49dcd..27de3174050a 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -410,4 +410,8 @@ extern struct trace_event_call *
 create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 			  bool is_return);
 extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
+
+extern struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
+extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
 #endif
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 40592e7b3568..e3cbae702a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = {
 	.trace		= print_uprobe_event
 };
 
-static int register_uprobe_event(struct trace_uprobe *tu)
+static inline void init_trace_event_call(struct trace_uprobe *tu,
+					 struct trace_event_call *call)
 {
-	struct trace_event_call *call = &tu->tp.call;
-	int ret;
-
-	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	call->event.funcs = &uprobe_funcs;
 	call->class->define_fields = uprobe_event_define_fields;
 
+	call->flags = TRACE_EVENT_FL_UPROBE;
+	call->class->reg = trace_uprobe_register;
+	call->data = tu;
+}
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+	struct trace_event_call *call = &tu->tp.call;
+	int ret = 0;
+
+	init_trace_event_call(tu, call);
+
 	if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
 		return -ENOMEM;
 
@@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 		return -ENODEV;
 	}
 
-	call->flags = TRACE_EVENT_FL_UPROBE;
-	call->class->reg = trace_uprobe_register;
-	call->data = tu;
 	ret = trace_add_event_call(call);
 
 	if (ret) {
@@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
 	return 0;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
+{
+	struct trace_uprobe *tu;
+	struct inode *inode;
+	struct path path;
+	int ret;
+
+	ret = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
+	inode = igrab(d_inode(path.dentry));
+	path_put(&path);
+
+	if (!inode || !S_ISREG(inode->i_mode)) {
+		iput(inode);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * local trace_kprobes are not added to probe_list, so they are never
+	 * searched in find_trace_kprobe(). Therefore, there is no concern of
+	 * duplicated name "DUMMY_EVENT" here.
+	 */
+	tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0,
+				is_return);
+
+	if (IS_ERR(tu)) {
+		pr_info("Failed to allocate trace_uprobe.(%d)\n",
+			(int)PTR_ERR(tu));
+		return ERR_CAST(tu);
+	}
+
+	tu->offset = offs;
+	tu->inode = inode;
+	tu->filename = kstrdup(name, GFP_KERNEL);
+	init_trace_event_call(tu, &tu->tp.call);
+
+	if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	return &tu->tp.call;
+error:
+	free_trace_uprobe(tu);
+	return ERR_PTR(ret);
+}
+
+void destroy_local_trace_uprobe(struct trace_event_call *event_call)
+{
+	struct trace_uprobe *tu;
+
+	tu = container_of(event_call, struct trace_uprobe, tp.call);
+
+	kfree(tu->tp.call.print_fmt);
+	tu->tp.call.print_fmt = NULL;
+
+	free_trace_uprobe(tu);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
 /* Make a trace interface for controling probe points */
 static __init int init_uprobe_trace(void)
 {
-- 
cgit v1.2.3-71-gd317


From c53593e5cb693d59d9e8b64fb3a79436bf99c3b3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 22 Jan 2018 11:26:18 -0800
Subject: sched, cgroup: Don't reject lower cpu.max on ancestors

While adding cgroup2 interface for the cpu controller, 0d5936344f30
("sched: Implement interface for cgroup unified hierarchy") forgot to
update input validation and left it to reject cpu.max config if any
descendant has set a higher value.

cgroup2 officially supports delegation and a descendant must not be
able to restrict what its ancestors can configure.  For absolute
limits such as cpu.max and memory.max, this means that the config at
each level should only act as the upper limit at that level and
shouldn't interfere with what other cgroups can configure.

This patch updates config validation on cgroup2 so that the cpu
controller follows the same convention.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 0d5936344f30 ("sched: Implement interface for cgroup unified hierarchy")
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org # v4.15+
---
 kernel/sched/core.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf724c1952ea..1bc6a694c84f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,13 +6678,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 		parent_quota = parent_b->hierarchical_quota;
 
 		/*
-		 * Ensure max(child_quota) <= parent_quota, inherit when no
+		 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
+		 * always take the min.  On cgroup1, only inherit when no
 		 * limit is set:
 		 */
-		if (quota == RUNTIME_INF)
-			quota = parent_quota;
-		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
-			return -EINVAL;
+		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
+			quota = min(quota, parent_quota);
+		} else {
+			if (quota == RUNTIME_INF)
+				quota = parent_quota;
+			else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+				return -EINVAL;
+		}
 	}
 	cfs_b->hierarchical_quota = quota;
 
-- 
cgit v1.2.3-71-gd317


From 387f77cc8249c847b4fa4d8c93694818b79efee3 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 13 Feb 2018 09:59:42 +0100
Subject: sched/fair: Remove stray space in #ifdef

Remove a useless space in # ifdef and align it with others.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518512382-29426-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..820f94c9b200 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2823,7 +2823,7 @@ void reweight_task(struct task_struct *p, int prio)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-# ifdef CONFIG_SMP
+#ifdef CONFIG_SMP
 /*
  * All this does is approximate the hierarchical proportion which includes that
  * global sum we all love to hate.
@@ -2974,7 +2974,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 
 	return clamp_t(long, runnable, MIN_SHARES, shares);
 }
-# endif /* CONFIG_SMP */
+#endif /* CONFIG_SMP */
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-- 
cgit v1.2.3-71-gd317


From 398953e62ce3b27f9f7805e367195b7ee6705f57 Mon Sep 17 00:00:00 2001
From: Lihao Liang <lianglihao@huawei.com>
Date: Wed, 22 Nov 2017 19:00:55 +0000
Subject: rcu: Remove unnecessary spinlock in rcu_boot_init_percpu_data()

Since rcu_boot_init_percpu_data() is only called at boot time,
there is no data race and spinlock is not needed.

Signed-off-by: Lihao Liang <lianglihao@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 491bdf39f276..66c73a214cff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3636,12 +3636,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
-	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
@@ -3649,7 +3646,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->cpu = cpu;
 	rdp->rsp = rsp;
 	rcu_boot_init_nocb_percpu_data(rdp);
-	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From a7c8655b073d89303911c89d0fd9fc4be7631fbe Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 30 Nov 2017 15:36:35 -0800
Subject: sched/isolation: Eliminate NO_HZ_FULL_ALL

Commit 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter")
broke CONFIG_NO_HZ_FULL_ALL=y kernels.  This breakage is due to the code
under CONFIG_NO_HZ_FULL_ALL failing to invoke the shiny new housekeeping
functions.  This means that rcutorture scenario TREE04 now emits RCU CPU
stall warnings due to the RCU grace-period kthreads not being awakened
at a time of their choosing, or perhaps even not at all:

[   27.731422] rcu_bh kthread starved for 21001 jiffies! g18446744073709551369 c18446744073709551368 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=3
[   27.731423] rcu_bh          I14936     9      2 0x80080000
[   27.731435] Call Trace:
[   27.731440]  __schedule+0x31a/0x6d0
[   27.731442]  schedule+0x31/0x80
[   27.731446]  schedule_timeout+0x15a/0x320
[   27.731453]  ? call_timer_fn+0x130/0x130
[   27.731457]  rcu_gp_kthread+0x66c/0xea0
[   27.731458]  ? rcu_gp_kthread+0x66c/0xea0

Because no one has complained about CONFIG_NO_HZ_FULL_ALL=y being broken,
I hypothesize that no one is in fact using it, other than rcutorture.
This commit therefore eliminates CONFIG_NO_HZ_FULL_ALL and updates
rcutorture's config files to instead use the nohz_full= kernel parameter
to put the desired CPUs into nohz_full mode.

Fixes: 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter")

Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/timers/NO_HZ.txt                     |  7 -------
 kernel/time/Kconfig                                | 10 ----------
 kernel/time/tick-sched.c                           | 22 ++--------------------
 .../selftests/rcutorture/configs/rcu/TASKS03       |  1 -
 .../selftests/rcutorture/configs/rcu/TASKS03.boot  |  2 +-
 .../selftests/rcutorture/configs/rcu/TREE04        |  1 -
 .../selftests/rcutorture/configs/rcu/TREE04.boot   |  2 +-
 .../selftests/rcutorture/configs/rcu/TREE07        |  1 -
 8 files changed, 4 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index 2dcaf9adb7a7..9591092da5e0 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -131,13 +131,6 @@ error message, and the boot CPU will be removed from the mask.  Note that
 this means that your system must have at least two CPUs in order for
 CONFIG_NO_HZ_FULL=y to do anything for you.
 
-Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies
-that all CPUs other than the boot CPU are adaptive-ticks CPUs.  This
-Kconfig parameter will be overridden by the "nohz_full=" boot parameter,
-so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and
-the "nohz_full=1" boot parameter is specified, the boot parameter will
-prevail so that only CPU 1 will be an adaptive-ticks CPU.
-
 Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded.
 This is covered in the "RCU IMPLICATIONS" section below.
 
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f6b5f19223d6..78eabc41eaa6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -113,16 +113,6 @@ config NO_HZ_FULL
 
 endchoice
 
-config NO_HZ_FULL_ALL
-       bool "Full dynticks system on all CPUs by default (except CPU 0)"
-       depends on NO_HZ_FULL
-       help
-         If the user doesn't pass the nohz_full boot option to
-	 define the range of full dynticks CPUs, consider that all
-	 CPUs in the system are full dynticks by default.
-	 Note the boot CPU will still be kept outside the range to
-	 handle the timekeeping duty.
-
 config NO_HZ
 	bool "Old Idle dynticks config"
 	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..ccd3782da0bf 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu)
 	return 0;
 }
 
-static int tick_nohz_init_all(void)
-{
-	int err = -1;
-
-#ifdef CONFIG_NO_HZ_FULL_ALL
-	if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
-		WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
-		return err;
-	}
-	err = 0;
-	cpumask_setall(tick_nohz_full_mask);
-	tick_nohz_full_running = true;
-#endif
-	return err;
-}
-
 void __init tick_nohz_init(void)
 {
 	int cpu, ret;
 
-	if (!tick_nohz_full_running) {
-		if (tick_nohz_init_all() < 0)
-			return;
-	}
+	if (!tick_nohz_full_running)
+		return;
 
 	/*
 	 * Full dynticks uses irq work to drive the tick rescheduling on safe
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
index c70c51d5ded1..28568b72a31b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
@@ -9,5 +9,4 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
-CONFIG_NO_HZ_FULL_ALL=y
 #CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot
index cd2a188eeb6d..838297c58318 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot
@@ -1 +1 @@
-rcutorture.torture_type=tasks
+rcutorture.torture_type=tasks nohz_full=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index 27d22695d64c..24c9f6012e35 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
-CONFIG_NO_HZ_FULL_ALL=y
 CONFIG_RCU_FAST_NO_HZ=y
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
index e34c33430447..e6071bb96c7d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
@@ -1 +1 @@
-rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4
+rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 nohz_full=1-7
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index 0f4759f4232e..d7afb271a586 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
-CONFIG_NO_HZ_FULL_ALL=n
 CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
-- 
cgit v1.2.3-71-gd317


From 3016611eedaff3e75feba54392c105db214a4f52 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 4 Dec 2017 09:48:59 -0800
Subject: rcu: Fix CPU offload boot message when no CPUs are offloaded

In CONFIG_RCU_NOCB_CPU=y kernels, if the boot parameters indicate that
none of the CPUs should in fact be offloaded, the following somewhat
obtuse message appears:

	Offload RCU callbacks from CPUs: .

This commit therefore makes the message at least grammatically correct
in this case:

	Offload RCU callbacks from CPUs: (none)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fb88a028deec..93f5649ffd2b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2312,8 +2312,11 @@ void __init rcu_init_nohz(void)
 		cpumask_and(rcu_nocb_mask, cpu_possible_mask,
 			    rcu_nocb_mask);
 	}
-	pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
-		cpumask_pr_args(rcu_nocb_mask));
+	if (cpumask_empty(rcu_nocb_mask))
+		pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+	else
+		pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+			cpumask_pr_args(rcu_nocb_mask));
 	if (rcu_nocb_poll)
 		pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
 
-- 
cgit v1.2.3-71-gd317


From 3caa973b7a260e7a2a69edc94c300ab9c65148c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 10:38:17 -0800
Subject: rcu: Call touch_nmi_watchdog() while printing stall warnings

When RCU stall warning triggers, it can print out a lot of messages
while holding spinlocks.  If the console device is slow (e.g. an
actual or IPMI serial console), it may end up triggering NMI hard
lockup watchdog like the following.

*** CPU printking while holding RCU spinlock

  PID: 4149739  TASK: ffff881a46baa880  CPU: 13  COMMAND: "CPUThreadPool8"
   #0 [ffff881fff945e48] crash_nmi_callback at ffffffff8103f7d0
   #1 [ffff881fff945e58] nmi_handle at ffffffff81020653
   #2 [ffff881fff945eb0] default_do_nmi at ffffffff81020c36
   #3 [ffff881fff945ed0] do_nmi at ffffffff81020d32
   #4 [ffff881fff945ef0] end_repeat_nmi at ffffffff81956a7e
      [exception RIP: io_serial_in+21]
      RIP: ffffffff81630e55  RSP: ffff881fff943b88  RFLAGS: 00000002
      RAX: 000000000000ca00  RBX: ffffffff8230e188  RCX: 0000000000000000
      RDX: 00000000000002fd  RSI: 0000000000000005  RDI: ffffffff8230e188
      RBP: ffff881fff943bb0   R8: 0000000000000000   R9: ffffffff820cb3c4
      R10: 0000000000000019  R11: 0000000000002000  R12: 00000000000026e1
      R13: 0000000000000020  R14: ffffffff820cd398  R15: 0000000000000035
      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0000
  --- <NMI exception stack> ---
   #5 [ffff881fff943b88] io_serial_in at ffffffff81630e55
   #6 [ffff881fff943b90] wait_for_xmitr at ffffffff8163175c
   #7 [ffff881fff943bb8] serial8250_console_putchar at ffffffff816317dc
   #8 [ffff881fff943bd8] uart_console_write at ffffffff8162ac00
   #9 [ffff881fff943c08] serial8250_console_write at ffffffff81634691
  #10 [ffff881fff943c80] univ8250_console_write at ffffffff8162f7c2
  #11 [ffff881fff943c90] console_unlock at ffffffff810dfc55
  #12 [ffff881fff943cf0] vprintk_emit at ffffffff810dffb5
  #13 [ffff881fff943d50] vprintk_default at ffffffff810e01bf
  #14 [ffff881fff943d60] vprintk_func at ffffffff810e1127
  #15 [ffff881fff943d70] printk at ffffffff8119a8a4
  #16 [ffff881fff943dd0] print_cpu_stall_info at ffffffff810eb78c
  #17 [ffff881fff943e88] rcu_check_callbacks at ffffffff810ef133
  #18 [ffff881fff943ee8] update_process_times at ffffffff810f3497
  #19 [ffff881fff943f10] tick_sched_timer at ffffffff81103037
  #20 [ffff881fff943f38] __hrtimer_run_queues at ffffffff810f3f38
  #21 [ffff881fff943f88] hrtimer_interrupt at ffffffff810f442b

*** CPU triggering the hardlockup watchdog

  PID: 4149709  TASK: ffff88010f88c380  CPU: 26  COMMAND: "CPUThreadPool35"
   #0 [ffff883fff1059d0] machine_kexec at ffffffff8104a874
   #1 [ffff883fff105a30] __crash_kexec at ffffffff811116cc
   #2 [ffff883fff105af0] __crash_kexec at ffffffff81111795
   #3 [ffff883fff105b08] panic at ffffffff8119a6ae
   #4 [ffff883fff105b98] watchdog_overflow_callback at ffffffff81135dbd
   #5 [ffff883fff105bb0] __perf_event_overflow at ffffffff81186866
   #6 [ffff883fff105be8] perf_event_overflow at ffffffff81192bc4
   #7 [ffff883fff105bf8] intel_pmu_handle_irq at ffffffff8100b265
   #8 [ffff883fff105df8] perf_event_nmi_handler at ffffffff8100489f
   #9 [ffff883fff105e58] nmi_handle at ffffffff81020653
  #10 [ffff883fff105eb0] default_do_nmi at ffffffff81020b94
  #11 [ffff883fff105ed0] do_nmi at ffffffff81020d32
  #12 [ffff883fff105ef0] end_repeat_nmi at ffffffff81956a7e
      [exception RIP: queued_spin_lock_slowpath+248]
      RIP: ffffffff810da958  RSP: ffff883fff103e68  RFLAGS: 00000046
      RAX: 0000000000000000  RBX: 0000000000000046  RCX: 00000000006d0000
      RDX: ffff883fff49a950  RSI: 0000000000d10101  RDI: ffffffff81e54300
      RBP: ffff883fff103e80   R8: ffff883fff11a950   R9: 0000000000000000
      R10: 000000000e5873ba  R11: 000000000000010f  R12: ffffffff81e54300
      R13: 0000000000000000  R14: ffff88010f88c380  R15: ffffffff81e54300
      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
  --- <NMI exception stack> ---
  #13 [ffff883fff103e68] queued_spin_lock_slowpath at ffffffff810da958
  #14 [ffff883fff103e70] _raw_spin_lock_irqsave at ffffffff8195550b
  #15 [ffff883fff103e88] rcu_check_callbacks at ffffffff810eed18
  #16 [ffff883fff103ee8] update_process_times at ffffffff810f3497
  #17 [ffff883fff103f10] tick_sched_timer at ffffffff81103037
  #18 [ffff883fff103f38] __hrtimer_run_queues at ffffffff810f3f38
  #19 [ffff883fff103f88] hrtimer_interrupt at ffffffff810f442b
  --- <IRQ stack> ---

Avoid spuriously triggering NMI hardlockup watchdog by touching it
from the print functions.  show_state_filter() shares the same problem
and solution.

v2: Relocate the comment to where it belongs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 93f5649ffd2b..7e31fb1e9fd8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 	}
 	t = list_entry(rnp->gp_tasks->prev,
 		       struct task_struct, rcu_node_entry);
-	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+		/*
+		 * We could be printing a lot while holding a spinlock.
+		 * Avoid triggering hard lockup.
+		 */
+		touch_nmi_watchdog();
 		sched_show_task(t);
+	}
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
@@ -1677,6 +1683,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 	char *ticks_title;
 	unsigned long ticks_value;
 
+	/*
+	 * We could be printing a lot while holding a spinlock.  Avoid
+	 * triggering hard lockup.
+	 */
+	touch_nmi_watchdog();
+
 	if (rsp->gpnum == rdp->gpnum) {
 		ticks_title = "ticks this GP";
 		ticks_value = rdp->ticks_this_gp;
-- 
cgit v1.2.3-71-gd317


From bec06785fe2c866ccf39cafa18935d88d77d559a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 Jan 2018 12:16:42 -0800
Subject: rcu: Remove obsolete boost statistics for debugfs

The debugfs interface displayed statistics on RCU priority boosting,
but this interface has since been removed.  This commit therefore
removes the no-longer-used rcu_data structure's ->n_tasks_boosted,
->n_exp_boosts, and ->n_exp_boosts and their updates.

If this information proves necessary in the future, the corresponding
event traces will be added.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.h        | 6 ------
 kernel/rcu/tree_plugin.h | 8 ++------
 2 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6488a3b0e729..01abd1c4e5da 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -146,12 +146,6 @@ struct rcu_node {
 				/*  boosting for this rcu_node structure. */
 	unsigned int boost_kthread_status;
 				/* State of boost_kthread_task for tracing. */
-	unsigned long n_tasks_boosted;
-				/* Total number of tasks boosted. */
-	unsigned long n_exp_boosts;
-				/* Number of tasks boosted for expedited GP. */
-	unsigned long n_normal_boosts;
-				/* Number of tasks boosted for normal GP. */
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct swait_queue_head nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e31fb1e9fd8..0d552985b905 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -963,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp)
 	 * expedited grace period must boost all blocked tasks, including
 	 * those blocking the pre-existing normal grace period.
 	 */
-	if (rnp->exp_tasks != NULL) {
+	if (rnp->exp_tasks != NULL)
 		tb = rnp->exp_tasks;
-		rnp->n_exp_boosts++;
-	} else {
+	else
 		tb = rnp->boost_tasks;
-		rnp->n_normal_boosts++;
-	}
-	rnp->n_tasks_boosted++;
 
 	/*
 	 * We boost task t by manufacturing an rt_mutex that appears to
-- 
cgit v1.2.3-71-gd317


From 62df63e048daf4e29373bbbaae3751e5af5d9502 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 Jan 2018 12:21:40 -0800
Subject: rcu: Remove obsolete callback-invocation statistics for debugfs

The debugfs interface displayed statistics on RCU callback invocation but
this interface has since been removed.  This commit therefore removes the
no-longer-used rcu_data structure's ->n_cbs_invoked and ->n_nocbs_invoked
fields along with their updates.

If this information proves necessary in the future, the corresponding
event traces will be added.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 1 -
 kernel/rcu/tree.h        | 2 --
 kernel/rcu/tree_plugin.h | 1 -
 3 files changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 66c73a214cff..8e0711954bbf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2691,7 +2691,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	/* Update counts and requeue any remaining callbacks. */
 	rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
 	smp_mb(); /* List handling before counting for rcu_barrier(). */
-	rdp->n_cbs_invoked += count;
 	rcu_segcblist_insert_count(&rdp->cblist, &rcl);
 
 	/* Reinstate batch limit if we have worked down the excess. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 01abd1c4e5da..b258fac73524 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -211,8 +211,6 @@ struct rcu_data {
 					/* different grace periods. */
 	long		qlen_last_fqs_check;
 					/* qlen at last check for QS forcing */
-	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */
-	unsigned long	n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0d552985b905..1c2d58a85511 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2243,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg)
 		smp_mb__before_atomic();  /* _add after CB invocation. */
 		atomic_long_add(-c, &rdp->nocb_q_count);
 		atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
-		rdp->n_nocbs_invoked += c;
 	}
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From 01c495f72a3b5a210e5689deba1ef33c82e8aa30 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 Jan 2018 12:36:00 -0800
Subject: rcu: Remove obsolete __rcu_pending() statistics for debugfs

The debugfs interface displayed statistics on RCU-pending checks
but this interface has since been removed.  This commit therefore
removes the no-longer-used rcu_data structure's ->n_rcu_pending,
->n_rp_core_needs_qs, ->n_rp_report_qs, ->n_rp_cb_ready,
->n_rp_cpu_needs_gp, ->n_rp_gp_completed, ->n_rp_gp_started,
->n_rp_nocb_defer_wakeup, and ->n_rp_need_nothing fields along with
their updates.

If this information proves necessary in the future, the corresponding
event traces will be added.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 31 ++++++-------------------------
 kernel/rcu/tree.h | 17 +++--------------
 2 files changed, 9 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e0711954bbf..99d59be761d1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3354,8 +3354,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	struct rcu_node *rnp = rdp->mynode;
 
-	rdp->n_rcu_pending++;
-
 	/* Check for CPU stalls, if enabled. */
 	check_cpu_stall(rsp, rdp);
 
@@ -3364,48 +3362,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 0;
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rcu_scheduler_fully_active &&
-	    rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
-	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
-		rdp->n_rp_core_needs_qs++;
-	} else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
-		rdp->n_rp_report_qs++;
+	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
 		return 1;
-	}
 
 	/* Does this CPU have callbacks ready to invoke? */
-	if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
-		rdp->n_rp_cb_ready++;
+	if (rcu_segcblist_ready_cbs(&rdp->cblist))
 		return 1;
-	}
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
-	if (cpu_needs_another_gp(rsp, rdp)) {
-		rdp->n_rp_cpu_needs_gp++;
+	if (cpu_needs_another_gp(rsp, rdp))
 		return 1;
-	}
 
 	/* Has another RCU grace period completed?  */
-	if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
-		rdp->n_rp_gp_completed++;
+	if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */
 		return 1;
-	}
 
 	/* Has a new RCU grace period started? */
 	if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
-	    unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
-		rdp->n_rp_gp_started++;
+	    unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
 		return 1;
-	}
 
 	/* Does this CPU need a deferred NOCB wakeup? */
-	if (rcu_nocb_need_deferred_wakeup(rdp)) {
-		rdp->n_rp_nocb_defer_wakeup++;
+	if (rcu_nocb_need_deferred_wakeup(rdp))
 		return 1;
-	}
 
 	/* nothing to do */
-	rdp->n_rp_need_nothing++;
 	return 0;
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b258fac73524..d29bab8dea28 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -226,18 +226,7 @@ struct rcu_data {
 					/* Grace period that needs help */
 					/*  from cond_resched(). */
 
-	/* 5) __rcu_pending() statistics. */
-	unsigned long n_rcu_pending;	/* rcu_pending() calls since boot. */
-	unsigned long n_rp_core_needs_qs;
-	unsigned long n_rp_report_qs;
-	unsigned long n_rp_cb_ready;
-	unsigned long n_rp_cpu_needs_gp;
-	unsigned long n_rp_gp_completed;
-	unsigned long n_rp_gp_started;
-	unsigned long n_rp_nocb_defer_wakeup;
-	unsigned long n_rp_need_nothing;
-
-	/* 6) _rcu_barrier(), OOM callbacks, and expediting. */
+	/* 5) _rcu_barrier(), OOM callbacks, and expediting. */
 	struct rcu_head barrier_head;
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
@@ -248,7 +237,7 @@ struct rcu_data {
 	atomic_long_t exp_workdone3;	/* # done by others #3. */
 	int exp_dynticks_snap;		/* Double-check need for IPI. */
 
-	/* 7) Callback offloading. */
+	/* 6) Callback offloading. */
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct rcu_head *nocb_head;	/* CBs waiting for kthread. */
 	struct rcu_head **nocb_tail;
@@ -275,7 +264,7 @@ struct rcu_data {
 					/* Leader CPU takes GP-end wakeups. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 
-	/* 8) RCU CPU stall data. */
+	/* 7) RCU CPU stall data. */
 	unsigned int softirq_snap;	/* Snapshot of softirq activity. */
 	/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
 	struct irq_work rcu_iw;		/* Check for non-irq activity. */
-- 
cgit v1.2.3-71-gd317


From d62df57370a5215fbf1b088d2ee51fa5d69bd0c3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 Jan 2018 13:10:49 -0800
Subject: rcu: Remove obsolete force-quiescent-state statistics for debugfs

The debugfs interface displayed statistics on RCU-pending checks but
this interface has since been removed.  This commit therefore removes the
no-longer-used rcu_state structure's ->n_force_qs_lh and ->n_force_qs_ngp
fields along with their updates.  (Though the ->n_force_qs_ngp field
was actually not used at all, embarrassingly enough.)

If this information proves necessary in the future, the corresponding
event traces will be added.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 5 +----
 kernel/rcu/tree.h | 4 ----
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 99d59be761d1..86fc087c7777 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2844,10 +2844,8 @@ static void force_quiescent_state(struct rcu_state *rsp)
 		      !raw_spin_trylock(&rnp->fqslock);
 		if (rnp_old != NULL)
 			raw_spin_unlock(&rnp_old->fqslock);
-		if (ret) {
-			rsp->n_force_qs_lh++;
+		if (ret)
 			return;
-		}
 		rnp_old = rnp;
 	}
 	/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
@@ -2856,7 +2854,6 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
 	raw_spin_unlock(&rnp_old->fqslock);
 	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
-		rsp->n_force_qs_lh++;
 		raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
 		return;  /* Someone beat us to it. */
 	}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d29bab8dea28..2edd56397b00 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -355,10 +355,6 @@ struct rcu_state {
 						/*  kthreads, if configured. */
 	unsigned long n_force_qs;		/* Number of calls to */
 						/*  force_quiescent_state(). */
-	unsigned long n_force_qs_lh;		/* ~Number of calls leaving */
-						/*  due to lock unavailable. */
-	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
-						/*  due to no GP active. */
 	unsigned long gp_start;			/* Time at which GP started, */
 						/*  but in jiffies. */
 	unsigned long gp_activity;		/* Time of last GP kthread */
-- 
cgit v1.2.3-71-gd317


From d07aee2c035e04dce11209a870b48091a47bd04a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 11 Jan 2018 12:08:20 -0800
Subject: rcu: More clearly identify grace-period kthread stack dump

It is not always obvious that the stack dump from a starved grace-period
kthread isn't instead that of a CPU stalling the current grace period.
This commit therefore adds a pr_err() flagging these dumps.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 86fc087c7777..4d7c727020f0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
 		       rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
 		if (rsp->gp_kthread) {
+			pr_err("RCU grace-period kthread stack dump:\n");
 			sched_show_task(rsp->gp_kthread);
 			wake_up_process(rsp->gp_kthread);
 		}
-- 
cgit v1.2.3-71-gd317


From bfbd767d4dba7fb41df0d356eecb7bbd99d0a7ee Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 11 Jan 2018 12:58:53 -0800
Subject: rcu: Consolidate rcu.h #ifdefs

The kernel/rcu/rcu.h file has a pair of consecutive #ifdefs on
CONFIG_TINY_RCU, so this commit consolidates them, thus saving a few
lines of code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 6334f2c1abd0..c90812673d54 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,24 +356,20 @@ static inline bool rcu_gp_is_normal(void) { return true; }
 static inline bool rcu_gp_is_expedited(void) { return false; }
 static inline void rcu_expedite_gp(void) { }
 static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_gp_is_normal(void);     /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
 void rcu_expedite_gp(void);
 void rcu_unexpedite_gp(void);
 void rcupdate_announce_bootup_oddness(void);
+void rcu_request_urgent_qs_task(struct task_struct *t);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #define RCU_SCHEDULER_INACTIVE	0
 #define RCU_SCHEDULER_INIT	1
 #define RCU_SCHEDULER_RUNNING	2
 
-#ifdef CONFIG_TINY_RCU
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
-#else /* #ifdef CONFIG_TINY_RCU */
-void rcu_request_urgent_qs_task(struct task_struct *t);
-#endif /* #else #ifdef CONFIG_TINY_RCU */
-
 enum rcutorture_type {
 	RCU_FLAVOR,
 	RCU_BH_FLAVOR,
-- 
cgit v1.2.3-71-gd317


From 65518db86b9ed1180b013c8a34c73c6ff7275886 Mon Sep 17 00:00:00 2001
From: "Liu, Changcheng" <changcheng.liu@intel.com>
Date: Tue, 16 Jan 2018 17:05:14 +0800
Subject: rcu: Remove redundant nxttail index macro define

RCU's nxttail has been optimized to be a rcu_segcblist, which is
a multi-tailed linked list with macros defined for the indexes for
each tail.  The indexes have been defined in linux/rcu_segcblist.h,
so this commit removes the redundant definitions in kernel/rcu/tree.h.

Signed-off-by: Liu Changcheng <changcheng.liu@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2edd56397b00..f491ab4f2e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -178,13 +178,6 @@ union rcu_noqs {
 	u16 s; /* Set of bits, aggregate OR here. */
 };
 
-/* Index values for nxttail array in struct rcu_data. */
-#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
-#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
-#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
-#define RCU_NEXT_TAIL		3
-#define RCU_NEXT_SIZE		4
-
 /* Per-CPU data for read-copy update. */
 struct rcu_data {
 	/* 1) quiescent-state and grace-period handling : */
-- 
cgit v1.2.3-71-gd317


From a32e01ee689794a26bdfdbaa7e8c334576cee36c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Wed, 17 Jan 2018 06:24:30 -0800
Subject: rcu: Use wrapper for lockdep asserts

Commits c0b334c5bfa9 and ea9b0c8a26a2 introduced new sparse warnings
by accessing rcu_node->lock directly and ignoring the __private
marker.  Introduce a new wrapper and use it.  Also fix a similar problem
in srcutree.c introduced by a3883df3935e.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h         |  5 ++++-
 kernel/rcu/srcutree.c    |  2 +-
 kernel/rcu/tree.c        | 24 ++++++++++++------------
 kernel/rcu/tree_plugin.h |  4 ++--
 4 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index c90812673d54..5d13f651cf08 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -337,7 +337,7 @@ do {									\
 } while (0)
 
 #define raw_spin_unlock_irqrestore_rcu_node(p, flags)			\
-	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)	\
+	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
 
 #define raw_spin_trylock_rcu_node(p)					\
 ({									\
@@ -348,6 +348,9 @@ do {									\
 	___locked;							\
 })
 
+#define raw_lockdep_assert_held_rcu_node(p)				\
+	lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
+
 #endif /* #if defined(SRCU) || !defined(TINY_RCU) */
 
 #ifdef CONFIG_TINY_RCU
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d5cea81378cc..9c6e0eaab4d5 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
 	struct srcu_data *sdp = this_cpu_ptr(sp->sda);
 	int state;
 
-	lockdep_assert_held(&sp->lock);
+	lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
 	WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&sp->srcu_gp_seq));
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4d7c727020f0..99d404c6bbbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
  */
 static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
 		WRITE_ONCE(rdp->gpwrap, true);
 	if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
@@ -1629,7 +1629,7 @@ void rcu_cpu_stall_reset(void)
 static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
 				       struct rcu_node *rnp)
 {
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/*
 	 * If RCU is idle, we just wait for the next grace period.
@@ -1676,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 	bool ret = false;
 	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/*
 	 * Pick up grace-period number for new callbacks.  If this
@@ -1804,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 {
 	bool ret = false;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
 	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1844,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			    struct rcu_data *rdp)
 {
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
 	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1872,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 	bool ret;
 	bool need_gp;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed &&
@@ -2297,7 +2297,7 @@ static bool
 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		      struct rcu_data *rdp)
 {
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
 		/*
 		 * Either we have not yet spawned the grace-period
@@ -2359,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
-	lockdep_assert_held(&rcu_get_root(rsp)->lock);
+	raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp));
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2384,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 	unsigned long oldmask = 0;
 	struct rcu_node *rnp_c;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 
 	/* Walk up the rcu_node hierarchy. */
 	for (;;) {
@@ -2448,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 	unsigned long mask;
 	struct rcu_node *rnp_p;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
 	    rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2593,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
 	    rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
 		return;
@@ -3596,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	for (;;) {
 		mask = rnp->grpmask;
 		rnp = rnp->parent;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1c2d58a85511..84fbee4686d3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 			 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
 	struct task_struct *t = current;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	WARN_ON_ONCE(rdp->mynode != rnp);
 	WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
 
@@ -1044,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
 	struct task_struct *t;
 
-	lockdep_assert_held(&rnp->lock);
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
-- 
cgit v1.2.3-71-gd317


From 274afd6bfa64984724963204b832cc71565a7740 Mon Sep 17 00:00:00 2001
From: Ildar Ismagilov <devix84@gmail.com>
Date: Tue, 30 Jan 2018 15:40:16 -0800
Subject: rcu: Fix misprint in srcu_funnel_exp_start

The srcu_funnel_exp_start() function checks to see if the srcu_struct
structure's expedited grace period counter needs updating to reflect a
newly arrived request for an expedited SRCU grace period.  Unfortunately,
the check is backwards, so this commit reverses the sense of the test.

Signed-off-by: Ildar Ismagilov <devix84@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9c6e0eaab4d5..045b559b9f22 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
 		spin_unlock_irqrestore_rcu_node(snp, flags);
 	}
 	spin_lock_irqsave_rcu_node(sp, flags);
-	if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
 		sp->srcu_gp_seq_needed_exp = s;
 	spin_unlock_irqrestore_rcu_node(sp, flags);
 }
-- 
cgit v1.2.3-71-gd317


From 9a414201ae7ea089699a0cbd36533345ca17233b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 31 Jan 2018 19:23:24 -0800
Subject: rcu: Add more tracing of expedited grace periods

This commit adds more tracing of expedited grace periods to enable
improved debugging of slowdowns.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h |  3 +++
 kernel/rcu/rcu.h           |  8 +++++++-
 kernel/rcu/tree_exp.h      | 12 ++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 0b50fda80db0..e56a618f2a59 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -179,6 +179,9 @@ TRACE_EVENT(rcu_grace_period_init,
  *
  *	"snap": Captured snapshot of expedited grace period sequence number.
  *	"start": Started a real expedited grace period.
+ *	"reset": Started resetting the tree
+ *	"select": Started selecting the CPUs to wait on.
+ *	"startwait": Started waiting on selected CPUs.
  *	"end": Ended a real expedited grace period.
  *	"endwake": Woke piggybackers up.
  *	"done": Someone else did the expedited grace period for us.
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5d13f651cf08..507a0802c717 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp)
 	WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
 }
 
+/* Compute the end-of-grace-period value for the specified sequence number. */
+static inline unsigned long rcu_seq_endval(unsigned long *sp)
+{
+	return (*sp | RCU_SEQ_STATE_MASK) + 1;
+}
+
 /* Adjust sequence number for end of update-side operation. */
 static inline void rcu_seq_end(unsigned long *sp)
 {
 	smp_mb(); /* Ensure update-side operation before counter increment. */
 	WARN_ON_ONCE(!rcu_seq_state(*sp));
-	WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
+	WRITE_ONCE(*sp, rcu_seq_endval(sp));
 }
 
 /* Take a snapshot of the update side's sequence number. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..70ad12abde36 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -28,6 +28,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
 	rcu_seq_start(&rsp->expedited_sequence);
 }
 
+/*
+ * Return then value that expedited-grace-period counter will have
+ * at the end of the current grace period.
+ */
+static unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
+{
+	return rcu_seq_endval(&rsp->expedited_sequence);
+}
+
 /*
  * Record the end of an expedited grace period.
  */
@@ -366,7 +375,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 	int ret;
 	struct rcu_node *rnp;
 
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
 	sync_exp_reset_tree(rsp);
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
 	rcu_for_each_leaf_node(rsp, rnp) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
@@ -443,6 +454,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	int ret;
 
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait"));
 	jiffies_stall = rcu_jiffies_till_stall_check();
 	jiffies_start = jiffies;
 
-- 
cgit v1.2.3-71-gd317


From 7f5d42d05155523a4c42c2c5170f2a368217aed5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 31 Jan 2018 19:40:22 -0800
Subject: rcu: Trace expedited GP delays due to transitioning CPUs

If a CPU is transitioning to or from offline state, an expedited
grace period may undergo a timed wait.  This timed wait can unduly
delay grace periods, so this commit adds a trace statement to make
it visible.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h | 1 +
 kernel/rcu/tree_exp.h      | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index e56a618f2a59..d8c33298c153 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -181,6 +181,7 @@ TRACE_EVENT(rcu_grace_period_init,
  *	"start": Started a real expedited grace period.
  *	"reset": Started resetting the tree
  *	"select": Started selecting the CPUs to wait on.
+ *	"selectofl": Selected CPU partially offline.
  *	"startwait": Started waiting on selected CPUs.
  *	"end": Ended a real expedited grace period.
  *	"endwake": Woke piggybackers up.
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 70ad12abde36..fecb6b6ab452 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -32,7 +32,7 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
  * Return then value that expedited-grace-period counter will have
  * at the end of the current grace period.
  */
-static unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
+static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
 {
 	return rcu_seq_endval(&rsp->expedited_sequence);
 }
@@ -428,6 +428,7 @@ retry_ipi:
 			    (rnp->expmask & mask)) {
 				/* Online, so delay for a bit and try again. */
 				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+				trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
 				schedule_timeout_uninterruptible(1);
 				goto retry_ipi;
 			}
-- 
cgit v1.2.3-71-gd317


From 65963d246147c46aafda2b04523d6dbe6c457e7c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 31 Jan 2018 20:24:15 -0800
Subject: rcu: Make expedited RCU CPU selection avoid unnecessary stores

This commit reworks the first loop in sync_rcu_exp_select_cpus()
to avoid doing unnecssary stores to other CPUs' rcu_data
structures.  This speeds up that first loop by roughly a factor of
two on an old x86 system.  In the case where the system is mostly
idle, this loop incurs a large fraction of the overhead of the
synchronize_rcu_expedited().  There is less benefit on busy systems
because the overhead of the smp_call_function_single() in the second
loop dominates in that case.

However, it is not unusual to do configuration chances involving
RCU grace periods (both expedited and normal) while the system is
mostly idle, so this optimization is worth doing.

While we are in the area, this commit also adds parentheses to arguments
used by the for_each_leaf_node_possible_cpu() macro.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h      | 16 +++++++++++++---
 kernel/rcu/tree_exp.h | 21 ++++++++++++++-------
 2 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 507a0802c717..1c868bcfd705 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -301,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * Iterate over all possible CPUs in a leaf RCU node.
  */
 #define for_each_leaf_node_possible_cpu(rnp, cpu) \
-	for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
-	     cpu <= rnp->grphi; \
-	     cpu = cpumask_next((cpu), cpu_possible_mask))
+	for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+	     (cpu) <= rnp->grphi; \
+	     (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Iterate over all CPUs in a leaf RCU node's specified mask.
+ */
+#define rcu_find_next_bit(rnp, cpu, mask) \
+	((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
+#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
+	for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+	     (cpu) <= rnp->grphi; \
+	     (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
 
 /*
  * Wrappers for the rcu_node::lock acquire and release.
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index fecb6b6ab452..6ad87642f44a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -383,15 +383,22 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 
 		/* Each pass checks a CPU for identity, offline, and idle. */
 		mask_ofl_test = 0;
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+			int snap;
 
-			rdp->exp_dynticks_snap =
-				rcu_dynticks_snap(rdp->dynticks);
 			if (raw_smp_processor_id() == cpu ||
-			    rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
-			    !(rnp->qsmaskinitnext & rdp->grpmask))
-				mask_ofl_test |= rdp->grpmask;
+			    !(rnp->qsmaskinitnext & mask)) {
+				mask_ofl_test |= mask;
+			} else {
+				snap = rcu_dynticks_snap(rdtp);
+				if (rcu_dynticks_in_eqs(snap))
+					mask_ofl_test |= mask;
+				else
+					rdp->exp_dynticks_snap = snap;
+			}
 		}
 		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
@@ -405,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
 			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 
-- 
cgit v1.2.3-71-gd317


From cb4081cd4e0c9fd88d186553edcc78e1bbb96ce6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 1 Dec 2017 15:28:11 -0800
Subject: srcu: Abstract function name

This commit moves to __func__ for function names in the name of better
resilience to change.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d5cea81378cc..01f4a8bf0e03 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
 	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
 	    WARN_ON(srcu_readers_active(sp))) {
-		pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
 	free_percpu(sp->sda);
-- 
cgit v1.2.3-71-gd317


From a35d13ec3686612b17771bf1abcad75d2ebd42ef Mon Sep 17 00:00:00 2001
From: Ildar Ismagilov <devix84@gmail.com>
Date: Wed, 31 Jan 2018 22:39:53 +0300
Subject: srcu: Prevent sdp->srcu_gp_seq_needed_exp counter wrap

SRCU checks each srcu_data structure's grace-period number for counter
wrap four times per cycle by default.  This frequency guarantees that
normal comparisons will detect potential wrap.  However, the expedited
grace-period number is not checked.  The consquences are not too horrible
(a failure to expedite a grace period when requested), but it would be
good to avoid such things.  This commit therefore adds this check to
the expedited grace-period number.

Signed-off-by: Ildar Ismagilov <devix84@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 01f4a8bf0e03..bb9607baad9b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -579,6 +579,9 @@ static void srcu_gp_end(struct srcu_struct *sp)
 				if (ULONG_CMP_GE(gpseq,
 						 sdp->srcu_gp_seq_needed + 100))
 					sdp->srcu_gp_seq_needed = gpseq;
+				if (ULONG_CMP_GE(gpseq,
+						 sdp->srcu_gp_seq_needed_exp + 100))
+					sdp->srcu_gp_seq_needed_exp = gpseq;
 				spin_unlock_irqrestore_rcu_node(sdp, flags);
 			}
 	}
-- 
cgit v1.2.3-71-gd317


From 8ddbd8832d25463a996a1075c35f312fb43c86b1 Mon Sep 17 00:00:00 2001
From: Ildar Ismagilov <devix84@gmail.com>
Date: Wed, 31 Jan 2018 22:42:21 +0300
Subject: srcu: Reduce scans of srcu_data in counter wrap check

Currently, given a multi-level srcu_node tree, SRCU can scan the full
set of srcu_data structures at each level when cleaning up after a grace
period.  This, though harmless otherwise, represents pointless overhead.
This commit therefore eliminates this overhead by scanning the srcu_data
structures only when traversing the leaf srcu_node structures.

Signed-off-by: Ildar Ismagilov <devix84@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index bb9607baad9b..d582a2352cdb 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -527,6 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 {
 	unsigned long cbdelay;
 	bool cbs;
+	bool last_lvl;
 	int cpu;
 	unsigned long flags;
 	unsigned long gpseq;
@@ -559,7 +560,8 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	rcu_for_each_node_breadth_first(sp, snp) {
 		spin_lock_irq_rcu_node(snp);
 		cbs = false;
-		if (snp >= sp->level[rcu_num_lvls - 1])
+		last_lvl = snp >= sp->level[rcu_num_lvls - 1];
+		if (last_lvl)
 			cbs = snp->srcu_have_cbs[idx] == gpseq;
 		snp->srcu_have_cbs[idx] = gpseq;
 		rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
@@ -572,7 +574,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 			srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
 
 		/* Occasionally prevent srcu_data counter wrap. */
-		if (!(gpseq & counter_wrap_check))
+		if (!(gpseq & counter_wrap_check) && last_lvl)
 			for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
 				sdp = per_cpu_ptr(sp->sda, cpu);
 				spin_lock_irqsave_rcu_node(sdp, flags);
-- 
cgit v1.2.3-71-gd317


From a72da917f1861ed06f8824328b1b3ecd003a5b5b Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Wed, 14 Feb 2018 18:05:24 +0900
Subject: srcu: Remove dead code in srcu_gp_end()

Of course, compilers will optimize out a dead code. Anyway, remove
any dead code for better readibility.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d582a2352cdb..f962f4428c39 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -532,7 +532,6 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	unsigned long flags;
 	unsigned long gpseq;
 	int idx;
-	int idxnext;
 	unsigned long mask;
 	struct srcu_data *sdp;
 	struct srcu_node *snp;
@@ -556,7 +555,6 @@ static void srcu_gp_end(struct srcu_struct *sp)
 
 	/* Initiate callback invocation as needed. */
 	idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
-	idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
 	rcu_for_each_node_breadth_first(sp, snp) {
 		spin_lock_irq_rcu_node(snp);
 		cbs = false;
-- 
cgit v1.2.3-71-gd317


From 6308f3477510e15391ac82deaee66f8425339014 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Feb 2018 14:20:45 -0800
Subject: rcu: Remove SRCU throttling

The code in srcu_gp_end() inserts a delay every 0x3ff grace periods in
order to prevent SRCU grace-period work from consuming an entire CPU
when there is a long sequence of expedited SRCU grace-period requests.
However, all of SRCU's grace-period work is carried out in workqueues,
which are in turn within kthreads, which are automatically throttled as
needed by the scheduler.  In particular, if there is plenty of idle time,
there is no point in throttling.

This commit therefore removes the expedited SRCU grace-period throttling.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index f962f4428c39..e2796f79a597 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -596,9 +596,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	    ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
 		srcu_gp_start(sp);
 		spin_unlock_irq_rcu_node(sp);
-		/* Throttle expedited grace periods: Should be rare! */
-		srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
-				    ? 0 : SRCU_INTERVAL);
+		srcu_reschedule(sp, 0);
 	} else {
 		spin_unlock_irq_rcu_node(sp);
 	}
-- 
cgit v1.2.3-71-gd317


From 68a675d433fbaa8a39aa9a63695da68192b70e3c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 1 Dec 2017 14:26:56 -0800
Subject: rcutorture: Replace multi-instance kzalloc() with kcalloc()

This commit replaces array-allocation calls to kzalloc() with
equivalent calls to kcalloc().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 308e6fdbced8..b4a201dcbc55 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1557,11 +1557,10 @@ static int rcu_torture_barrier_init(void)
 	atomic_set(&barrier_cbs_count, 0);
 	atomic_set(&barrier_cbs_invoked, 0);
 	barrier_cbs_tasks =
-		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+		kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
 			GFP_KERNEL);
 	barrier_cbs_wq =
-		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
-			GFP_KERNEL);
+		kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
 	if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
 		return -ENOMEM;
 	for (i = 0; i < n_barrier_cbs; i++) {
@@ -1799,7 +1798,7 @@ rcu_torture_init(void)
 	if (firsterr)
 		goto unwind;
 	if (nfakewriters > 0) {
-		fakewriter_tasks = kzalloc(nfakewriters *
+		fakewriter_tasks = kcalloc(nfakewriters,
 					   sizeof(fakewriter_tasks[0]),
 					   GFP_KERNEL);
 		if (fakewriter_tasks == NULL) {
@@ -1814,7 +1813,7 @@ rcu_torture_init(void)
 		if (firsterr)
 			goto unwind;
 	}
-	reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (reader_tasks == NULL) {
 		VERBOSE_TOROUT_ERRSTRING("out of memory");
-- 
cgit v1.2.3-71-gd317


From e0d31a34c6db6381bfc630a9ee93f05b9447dcc3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 1 Dec 2017 15:22:38 -0800
Subject: rcutorture: Abstract function and module names

This commit moves to __func__ for function names and for KBUILD_MODNAME
for module names, all in the name of better resilience to change.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b4a201dcbc55..0f94025c672a 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -924,19 +924,19 @@ rcu_torture_writer(void *arg)
 	if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
 		synctype[nsynctypes++] = RTWS_COND_GET;
 	else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
-		pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+		pr_alert("%s: gp_cond without primitives.\n", __func__);
 	if (gp_exp1 && cur_ops->exp_sync)
 		synctype[nsynctypes++] = RTWS_EXP_SYNC;
 	else if (gp_exp && !cur_ops->exp_sync)
-		pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+		pr_alert("%s: gp_exp without primitives.\n", __func__);
 	if (gp_normal1 && cur_ops->deferred_free)
 		synctype[nsynctypes++] = RTWS_DEF_FREE;
 	else if (gp_normal && !cur_ops->deferred_free)
-		pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+		pr_alert("%s: gp_normal without primitives.\n", __func__);
 	if (gp_sync1 && cur_ops->sync)
 		synctype[nsynctypes++] = RTWS_SYNC;
 	else if (gp_sync && !cur_ops->sync)
-		pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+		pr_alert("%s: gp_sync without primitives.\n", __func__);
 	if (WARN_ONCE(nsynctypes == 0,
 		      "rcu_torture_writer: No update-side primitives.\n")) {
 		/*
@@ -1673,7 +1673,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
 	 * next grace period.  Unlikely, but can happen.  If it
 	 * does happen, the debug-objects subsystem won't have splatted.
 	 */
-	pr_alert("rcutorture: duplicated callback was invoked.\n");
+	pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
 }
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
@@ -1690,7 +1690,7 @@ static void rcu_test_debug_objects(void)
 
 	init_rcu_head_on_stack(&rh1);
 	init_rcu_head_on_stack(&rh2);
-	pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+	pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
 
 	/* Try to queue the rh2 pair of callbacks for the same grace period. */
 	preempt_disable(); /* Prevent preemption from interrupting test. */
@@ -1705,11 +1705,11 @@ static void rcu_test_debug_objects(void)
 
 	/* Wait for them all to get done so we can safely return. */
 	rcu_barrier();
-	pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+	pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
 	destroy_rcu_head_on_stack(&rh1);
 	destroy_rcu_head_on_stack(&rh2);
 #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-	pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+	pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
 #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 }
 
-- 
cgit v1.2.3-71-gd317


From eb0339934f1d468ff09d9be1c608c89cb1da850b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 8 Dec 2017 10:48:41 -0800
Subject: rcutorture: Avoid fake-writer use of undefined primitives

Currently the rcu_torture_fakewriter() function invokes cur_ops->sync()
and cur_ops->exp_sync() without first checking to see if they are in
fact non-NULL.  This results in kernel NULL pointer dereferences when
testing RCU implementations that choose not to provide the full set of
primitives.  Given that it is perfectly reasonable to have specialized
RCU implementations that provide only a subset of the RCU API, this is
a bug in rcutorture.

This commit therefore makes rcu_torture_fakewriter() check function
pointers before invoking them, thus allowing it to test subsetted
RCU implementations.

Reported-by: Lihao Liang <lianglihao@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 0f94025c672a..6c46cd1d8fd7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1045,13 +1045,13 @@ rcu_torture_fakewriter(void *arg)
 		    torture_random(&rand) % (nfakewriters * 8) == 0) {
 			cur_ops->cb_barrier();
 		} else if (gp_normal == gp_exp) {
-			if (torture_random(&rand) & 0x80)
+			if (cur_ops->sync && torture_random(&rand) & 0x80)
 				cur_ops->sync();
-			else
+			else if (cur_ops->exp_sync)
 				cur_ops->exp_sync();
-		} else if (gp_normal) {
+		} else if (gp_normal && cur_ops->sync) {
 			cur_ops->sync();
-		} else {
+		} else if (cur_ops->exp_sync) {
 			cur_ops->exp_sync();
 		}
 		stutter_wait("rcu_torture_fakewriter");
-- 
cgit v1.2.3-71-gd317


From f7c0e6ad4bb1ab1cf358b8815f7d0ba7ff94d786 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 8 Dec 2017 11:37:24 -0800
Subject: rcutorture: Re-enable testing of dynamic expediting

During boot, normal grace periods are processed as expedited.  When
rcutorture is built into the kernel, it starts during boot and thus
detects that normal grace periods are unconditionally expedited.
Therefore, rcutorture concludes that there is no point in trying
to dynamically enable expediting, do it disables this aspect of testing,
which is a bit of an overreaction to the temporary boot-time expediting.

This commit therefore rechecks forced expediting throughout the test,
enabling dynamic expediting if normal grace periods are processed
normally at any point.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 6c46cd1d8fd7..2964b9236ddc 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -909,14 +909,10 @@ rcu_torture_writer(void *arg)
 	int nsynctypes = 0;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
-	if (!can_expedite) {
+	if (!can_expedite)
 		pr_alert("%s" TORTURE_FLAG
-			 " GP expediting controlled from boot/sysfs for %s,\n",
+			 " GP expediting controlled from boot/sysfs for %s.\n",
 			 torture_type, cur_ops->name);
-		pr_alert("%s" TORTURE_FLAG
-			 " Disabled dynamic grace-period expediting.\n",
-			 torture_type);
-	}
 
 	/* Initialize synctype[] array.  If none set, take default. */
 	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
@@ -1011,6 +1007,9 @@ rcu_torture_writer(void *arg)
 				rcu_unexpedite_gp();
 			if (++expediting > 3)
 				expediting = -expediting;
+		} else if (!can_expedite) { /* Disabled during boot, recheck. */
+			can_expedite = !rcu_gp_is_expedited() &&
+				       !rcu_gp_is_normal();
 		}
 		rcu_torture_writer_state = RTWS_STUTTER;
 		stutter_wait("rcu_torture_writer");
@@ -1021,6 +1020,10 @@ rcu_torture_writer(void *arg)
 	while (can_expedite && expediting++ < 0)
 		rcu_unexpedite_gp();
 	WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
+	if (!can_expedite)
+		pr_alert("%s" TORTURE_FLAG
+			 " Dynamic grace-period expediting was disabled.\n",
+			 torture_type);
 	rcu_torture_writer_state = RTWS_STOPPING;
 	torture_kthread_stopping("rcu_torture_writer");
 	return 0;
-- 
cgit v1.2.3-71-gd317


From db0c1a8aba31edd1432f3a5925a32107065b5568 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 8 Dec 2017 12:23:10 -0800
Subject: rcutorture: Record which grace-period primitives are tested

The rcu_torture_writer() function adapts to requested testing from module
parameters as well as the function pointers in the structure referenced
by cur_ops.  However, as long as the module parameters do not conflict
with the function pointers, this adaptation is silent.  This silence can
result in confusion as to exactly what was tested, which could in turn
result in untested RCU code making its way into mainline.

This commit therefore makes rcu_torture_writer() announce exactly which
portions of RCU's API it ends up testing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 2964b9236ddc..680c96d8c00f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -917,22 +917,30 @@ rcu_torture_writer(void *arg)
 	/* Initialize synctype[] array.  If none set, take default. */
 	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
 		gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
-	if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+	if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
-	else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+		pr_info("%s: Testing conditional GPs.\n", __func__);
+	} else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
 		pr_alert("%s: gp_cond without primitives.\n", __func__);
-	if (gp_exp1 && cur_ops->exp_sync)
+	}
+	if (gp_exp1 && cur_ops->exp_sync) {
 		synctype[nsynctypes++] = RTWS_EXP_SYNC;
-	else if (gp_exp && !cur_ops->exp_sync)
+		pr_info("%s: Testing expedited GPs.\n", __func__);
+	} else if (gp_exp && !cur_ops->exp_sync) {
 		pr_alert("%s: gp_exp without primitives.\n", __func__);
-	if (gp_normal1 && cur_ops->deferred_free)
+	}
+	if (gp_normal1 && cur_ops->deferred_free) {
 		synctype[nsynctypes++] = RTWS_DEF_FREE;
-	else if (gp_normal && !cur_ops->deferred_free)
+		pr_info("%s: Testing asynchronous GPs.\n", __func__);
+	} else if (gp_normal && !cur_ops->deferred_free) {
 		pr_alert("%s: gp_normal without primitives.\n", __func__);
-	if (gp_sync1 && cur_ops->sync)
+	}
+	if (gp_sync1 && cur_ops->sync) {
 		synctype[nsynctypes++] = RTWS_SYNC;
-	else if (gp_sync && !cur_ops->sync)
+		pr_info("%s: Testing normal GPs.\n", __func__);
+	} else if (gp_sync && !cur_ops->sync) {
 		pr_alert("%s: gp_sync without primitives.\n", __func__);
+	}
 	if (WARN_ONCE(nsynctypes == 0,
 		      "rcu_torture_writer: No update-side primitives.\n")) {
 		/*
-- 
cgit v1.2.3-71-gd317


From 85ba6bfe8bb2a4d907f7380a8f37b31616ad694e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 1 Feb 2018 19:19:04 -0800
Subject: torture: Provide more sensible nreader/nwriter defaults for rcuperf

The default values for nreader and nwriter are apparently not all that
user-friendly, resulting in people doing scalability tests that ran all
runs at large scale.  This commit therefore makes both the nreaders and
nwriters module default to the number of CPUs, and adds a comment to
rcuperf.c stating that the number of CPUs should be specified using the
nr_cpus kernel boot parameter.  This commit also eliminates the redundant
rcuperf scripting specification of default values for these parameters.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c                               | 21 ++++++++++++++++++-
 .../rcutorture/configs/rcuperf/ver_functions.sh    | 24 +---------------------
 2 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d1ebdf9868bb..777e7a6a0292 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
 #define VERBOSE_PERFOUT_ERRSTRING(s) \
 	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
 
+/*
+ * The intended use cases for the nreaders and nwriters module parameters
+ * are as follows:
+ *
+ * 1.	Specify only the nr_cpus kernel boot parameter.  This will
+ *	set both nreaders and nwriters to the value specified by
+ *	nr_cpus for a mixed reader/writer test.
+ *
+ * 2.	Specify the nr_cpus kernel boot parameter, but set
+ *	rcuperf.nreaders to zero.  This will set nwriters to the
+ *	value specified by nr_cpus for an update-only test.
+ *
+ * 3.	Specify the nr_cpus kernel boot parameter, but set
+ *	rcuperf.nwriters to zero.  This will set nreaders to the
+ *	value specified by nr_cpus for a read-only test.
+ *
+ * Various other use cases may of course be specified.
+ */
+
 torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
 torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, 0, "Number of RCU reader threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
 torture_param(int, nwriters, -1, "Number of RCU updater threads");
 torture_param(bool, shutdown, !IS_ENABLED(MODULE),
 	      "Shutdown at end of performance tests.");
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh
index b9603115d7c7..d36b8fd6f0fc 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh
@@ -20,32 +20,10 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 
-# rcuperf_param_nreaders bootparam-string
-#
-# Adds nreaders rcuperf module parameter if not already specified.
-rcuperf_param_nreaders () {
-	if ! echo "$1" | grep -q "rcuperf.nreaders"
-	then
-		echo rcuperf.nreaders=-1
-	fi
-}
-
-# rcuperf_param_nwriters bootparam-string
-#
-# Adds nwriters rcuperf module parameter if not already specified.
-rcuperf_param_nwriters () {
-	if ! echo "$1" | grep -q "rcuperf.nwriters"
-	then
-		echo rcuperf.nwriters=-1
-	fi
-}
-
 # per_version_boot_params bootparam-string config-file seconds
 #
 # Adds per-version torture-module parameters to kernels supporting them.
 per_version_boot_params () {
-	echo $1 `rcuperf_param_nreaders "$1"` \
-		`rcuperf_param_nwriters "$1"` \
-		rcuperf.shutdown=1 \
+	echo $1 rcuperf.shutdown=1 \
 		rcuperf.verbose=1
 }
-- 
cgit v1.2.3-71-gd317


From 7ebb66a12f85bc375beaf45ca900427fe47aa8f7 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 13 Feb 2018 13:37:25 +0000
Subject: sched/fair: Avoid an unnecessary lookup of current CPU ID during
 wake_affine

The only caller of wake_affine() knows the CPU ID. Pass it in instead of
rechecking it.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-2-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 820f94c9b200..0132572d7523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5751,9 +5751,8 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
-		       int prev_cpu, int sync)
+		       int this_cpu, int prev_cpu, int sync)
 {
-	int this_cpu = smp_processor_id();
 	int target = nr_cpumask_bits;
 
 	if (sched_feat(WA_IDLE))
@@ -6376,7 +6375,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		if (cpu == prev_cpu)
 			goto pick_cpu;
 
-		new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
+		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
 	}
 
 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-- 
cgit v1.2.3-71-gd317


From eeb60398639143c11ff2c8b509e3a471411bb5d3 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 13 Feb 2018 13:37:26 +0000
Subject: sched/fair: Defer calculation of 'prev_eff_load' in
 wake_affine_weight() until needed

On sync wakeups, the previous CPU effective load may not be used so delay
the calculation until it's needed.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-3-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0132572d7523..ae3e6f877711 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5724,7 +5724,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	unsigned long task_load;
 
 	this_eff_load = target_load(this_cpu, sd->wake_idx);
-	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
@@ -5742,6 +5741,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		this_eff_load *= 100;
 	this_eff_load *= capacity_of(prev_cpu);
 
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 	prev_eff_load -= task_load;
 	if (sched_feat(WA_BIAS))
 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
-- 
cgit v1.2.3-71-gd317


From 082f764a2f3f2968afa1a0b04a1ccb1b70633844 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 13 Feb 2018 13:37:27 +0000
Subject: sched/fair: Do not migrate on wake_affine_weight() if weights are
 equal

wake_affine_weight() will consider migrating a task to, or near, the current
CPU if there is a load imbalance. If the CPUs share LLC then either CPU
is valid as a search-for-idle-sibling target and equally appropriate for
stacking two tasks on one CPU if an idle sibling is unavailable. If they do
not share cache then a cross-node migration potentially impacts locality
so while they are equal from a CPU capacity point of view, they are not
equal in terms of memory locality. In either case, it's more appropriate
to migrate only if there is a difference in their effective load.

This patch modifies wake_affine_weight() to only consider migrating a task
if there is a load imbalance for normal wakeups but will allow potential
stacking if the loads are equal and it's a sync wakeup.

For the most part, the different in performance is marginal. For example,
on a 4-socket server running netperf UDP_STREAM on localhost the differences
are as follows:

                                      4.15.0                 4.15.0
                                       16rc0          noequal-v1r23
 Hmean     send-64         355.47 (   0.00%)      349.50 (  -1.68%)
 Hmean     send-128        697.98 (   0.00%)      693.35 (  -0.66%)
 Hmean     send-256       1328.02 (   0.00%)     1318.77 (  -0.70%)
 Hmean     send-1024      5051.83 (   0.00%)     5051.11 (  -0.01%)
 Hmean     send-2048      9637.02 (   0.00%)     9601.34 (  -0.37%)
 Hmean     send-3312     14355.37 (   0.00%)    14414.51 (   0.41%)
 Hmean     send-4096     16464.97 (   0.00%)    16301.37 (  -0.99%)
 Hmean     send-8192     26722.42 (   0.00%)    26428.95 (  -1.10%)
 Hmean     send-16384    38137.81 (   0.00%)    38046.11 (  -0.24%)
 Hmean     recv-64         355.47 (   0.00%)      349.50 (  -1.68%)
 Hmean     recv-128        697.98 (   0.00%)      693.35 (  -0.66%)
 Hmean     recv-256       1328.02 (   0.00%)     1318.77 (  -0.70%)
 Hmean     recv-1024      5051.83 (   0.00%)     5051.11 (  -0.01%)
 Hmean     recv-2048      9636.95 (   0.00%)     9601.30 (  -0.37%)
 Hmean     recv-3312     14355.32 (   0.00%)    14414.48 (   0.41%)
 Hmean     recv-4096     16464.74 (   0.00%)    16301.16 (  -0.99%)
 Hmean     recv-8192     26721.63 (   0.00%)    26428.17 (  -1.10%)
 Hmean     recv-16384    38136.00 (   0.00%)    38044.88 (  -0.24%)
 Stddev    send-64           7.30 (   0.00%)        4.75 (  34.96%)
 Stddev    send-128         15.15 (   0.00%)       22.38 ( -47.66%)
 Stddev    send-256         13.99 (   0.00%)       19.14 ( -36.81%)
 Stddev    send-1024       105.73 (   0.00%)       67.38 (  36.27%)
 Stddev    send-2048       294.57 (   0.00%)      223.88 (  24.00%)
 Stddev    send-3312       302.28 (   0.00%)      271.74 (  10.10%)
 Stddev    send-4096       195.92 (   0.00%)      121.10 (  38.19%)
 Stddev    send-8192       399.71 (   0.00%)      563.77 ( -41.04%)
 Stddev    send-16384     1163.47 (   0.00%)     1103.68 (   5.14%)
 Stddev    recv-64           7.30 (   0.00%)        4.75 (  34.96%)
 Stddev    recv-128         15.15 (   0.00%)       22.38 ( -47.66%)
 Stddev    recv-256         13.99 (   0.00%)       19.14 ( -36.81%)
 Stddev    recv-1024       105.73 (   0.00%)       67.38 (  36.27%)
 Stddev    recv-2048       294.59 (   0.00%)      223.89 (  24.00%)
 Stddev    recv-3312       302.24 (   0.00%)      271.75 (  10.09%)
 Stddev    recv-4096       196.03 (   0.00%)      121.14 (  38.20%)
 Stddev    recv-8192       399.86 (   0.00%)      563.65 ( -40.96%)
 Stddev    recv-16384     1163.79 (   0.00%)     1103.86 (   5.15%)

The difference in overall performance is marginal but note that most
measurements are less variable. There were similar observations for other
netperf comparisons. hackbench with sockets or threads with processes or
threads showed minor difference with some reduction of migration. tbench
showed only marginal differences that were within the noise. dbench,
regardless of filesystem, showed minor differences all of which are
within noise. Multiple machines, both UMA and NUMA were tested without
any regressions showing up.

The biggest risk with a patch like this is affecting wakeup latencies.
However, the schbench load from Facebook which is very sensitive to wakeup
latency showed a mixed result with mostly improvements in wakeup latency:

                                      4.15.0                 4.15.0
                                       16rc0          noequal-v1r23
 Lat 50.00th-qrtle-1        38.00 (   0.00%)       38.00 (   0.00%)
 Lat 75.00th-qrtle-1        49.00 (   0.00%)       41.00 (  16.33%)
 Lat 90.00th-qrtle-1        52.00 (   0.00%)       50.00 (   3.85%)
 Lat 95.00th-qrtle-1        54.00 (   0.00%)       51.00 (   5.56%)
 Lat 99.00th-qrtle-1        63.00 (   0.00%)       60.00 (   4.76%)
 Lat 99.50th-qrtle-1        66.00 (   0.00%)       61.00 (   7.58%)
 Lat 99.90th-qrtle-1        78.00 (   0.00%)       65.00 (  16.67%)
 Lat 50.00th-qrtle-2        38.00 (   0.00%)       38.00 (   0.00%)
 Lat 75.00th-qrtle-2        42.00 (   0.00%)       43.00 (  -2.38%)
 Lat 90.00th-qrtle-2        46.00 (   0.00%)       48.00 (  -4.35%)
 Lat 95.00th-qrtle-2        49.00 (   0.00%)       50.00 (  -2.04%)
 Lat 99.00th-qrtle-2        55.00 (   0.00%)       57.00 (  -3.64%)
 Lat 99.50th-qrtle-2        58.00 (   0.00%)       60.00 (  -3.45%)
 Lat 99.90th-qrtle-2        65.00 (   0.00%)       68.00 (  -4.62%)
 Lat 50.00th-qrtle-4        41.00 (   0.00%)       41.00 (   0.00%)
 Lat 75.00th-qrtle-4        45.00 (   0.00%)       46.00 (  -2.22%)
 Lat 90.00th-qrtle-4        50.00 (   0.00%)       50.00 (   0.00%)
 Lat 95.00th-qrtle-4        54.00 (   0.00%)       53.00 (   1.85%)
 Lat 99.00th-qrtle-4        61.00 (   0.00%)       61.00 (   0.00%)
 Lat 99.50th-qrtle-4        65.00 (   0.00%)       64.00 (   1.54%)
 Lat 99.90th-qrtle-4        76.00 (   0.00%)       82.00 (  -7.89%)
 Lat 50.00th-qrtle-8        48.00 (   0.00%)       46.00 (   4.17%)
 Lat 75.00th-qrtle-8        55.00 (   0.00%)       54.00 (   1.82%)
 Lat 90.00th-qrtle-8        60.00 (   0.00%)       59.00 (   1.67%)
 Lat 95.00th-qrtle-8        63.00 (   0.00%)       63.00 (   0.00%)
 Lat 99.00th-qrtle-8        71.00 (   0.00%)       69.00 (   2.82%)
 Lat 99.50th-qrtle-8        74.00 (   0.00%)       73.00 (   1.35%)
 Lat 99.90th-qrtle-8        98.00 (   0.00%)       90.00 (   8.16%)
 Lat 50.00th-qrtle-16       56.00 (   0.00%)       55.00 (   1.79%)
 Lat 75.00th-qrtle-16       68.00 (   0.00%)       67.00 (   1.47%)
 Lat 90.00th-qrtle-16       77.00 (   0.00%)       78.00 (  -1.30%)
 Lat 95.00th-qrtle-16       82.00 (   0.00%)       84.00 (  -2.44%)
 Lat 99.00th-qrtle-16       90.00 (   0.00%)       93.00 (  -3.33%)
 Lat 99.50th-qrtle-16       93.00 (   0.00%)       97.00 (  -4.30%)
 Lat 99.90th-qrtle-16      110.00 (   0.00%)      110.00 (   0.00%)
 Lat 50.00th-qrtle-32       68.00 (   0.00%)       62.00 (   8.82%)
 Lat 75.00th-qrtle-32       90.00 (   0.00%)       83.00 (   7.78%)
 Lat 90.00th-qrtle-32      110.00 (   0.00%)      100.00 (   9.09%)
 Lat 95.00th-qrtle-32      122.00 (   0.00%)      111.00 (   9.02%)
 Lat 99.00th-qrtle-32      145.00 (   0.00%)      133.00 (   8.28%)
 Lat 99.50th-qrtle-32      154.00 (   0.00%)      143.00 (   7.14%)
 Lat 99.90th-qrtle-32     2316.00 (   0.00%)      515.00 (  77.76%)
 Lat 50.00th-qrtle-35       69.00 (   0.00%)       72.00 (  -4.35%)
 Lat 75.00th-qrtle-35       92.00 (   0.00%)       95.00 (  -3.26%)
 Lat 90.00th-qrtle-35      111.00 (   0.00%)      114.00 (  -2.70%)
 Lat 95.00th-qrtle-35      122.00 (   0.00%)      124.00 (  -1.64%)
 Lat 99.00th-qrtle-35      142.00 (   0.00%)      144.00 (  -1.41%)
 Lat 99.50th-qrtle-35      150.00 (   0.00%)      154.00 (  -2.67%)
 Lat 99.90th-qrtle-35     6104.00 (   0.00%)     5640.00 (   7.60%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-4-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae3e6f877711..a07920f3a2fd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5747,7 +5747,16 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
 	prev_eff_load *= capacity_of(this_cpu);
 
-	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+	/*
+	 * If sync, adjust the weight of prev_eff_load such that if
+	 * prev_eff == this_eff that select_idle_sibling() will consider
+	 * stacking the wakee on top of the waker if no other CPU is
+	 * idle.
+	 */
+	if (sync)
+		prev_eff_load += 1;
+
+	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
-- 
cgit v1.2.3-71-gd317


From 24d0c1d6e65f635b2c0684d0a42ff6c0674aa0e6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 13 Feb 2018 13:37:28 +0000
Subject: sched/fair: Do not migrate due to a sync wakeup on exit

When a task exits, it notifies the parent that it has exited. This is a
sync wakeup and the exiting task may pull the parent towards the wakers
CPU. For simple workloads like using a shell, it was observed that the
shell is pulled across nodes by exiting processes. This is daft as the
parent may be long-lived and properly placed. This patch special cases a
sync wakeup on exit to avoid pulling tasks across nodes. Testing on a range
of workloads and machines showed very little differences in performance
although there was a small 3% boost on some machines running a shellscript
intensive workload (git regression test suite).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-5-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a07920f3a2fd..302dda81e192 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6350,7 +6350,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
-	int sync = wake_flags & WF_SYNC;
+	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
-- 
cgit v1.2.3-71-gd317


From 2c83362734dad8e48ccc0710b5cd2436a0323893 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 13 Feb 2018 13:37:29 +0000
Subject: sched/fair: Consider SD_NUMA when selecting the most idle group to
 schedule on

find_idlest_group() compares a local group with each other group to select
the one that is most idle. When comparing groups in different NUMA domains,
a very slight imbalance is enough to select a remote NUMA node even if the
runnable load on both groups is 0 or close to 0. This ignores the cost of
remote accesses entirely and is a problem when selecting the CPU for a
newly forked task to run on.  This is problematic when a forking server
is almost guaranteed to run on a remote node incurring numerous remote
accesses and potentially causing automatic NUMA balancing to try migrate
the task back or migrate the data to another node. Similar weirdness is
observed if a basic shell command pipes output to another as each process
in the pipeline is likely to start on different nodes and then get adjusted
later by wake_affine().

This patch adds imbalance to remote domains when considering whether to
select CPUs from remote domains. If the local domain is selected, imbalance
will still be used to try select a CPU from a lower scheduler domain's group
instead of stacking tasks on the same CPU.

A variety of workloads and machines were tested and as expected, there is no
difference on UMA. The difference on NUMA can be dramatic. This is a comparison
of elapsed times running the git regression test suite. It's fork-intensive with
short-lived processes:

                                  4.15.0                 4.15.0
                            noexit-v1r23           sdnuma-v1r23
 Elapsed min          1706.06 (   0.00%)     1435.94 (  15.83%)
 Elapsed mean         1709.53 (   0.00%)     1436.98 (  15.94%)
 Elapsed stddev          2.16 (   0.00%)        1.01 (  53.38%)
 Elapsed coeffvar        0.13 (   0.00%)        0.07 (  44.54%)
 Elapsed max          1711.59 (   0.00%)     1438.01 (  15.98%)

               4.15.0      4.15.0
         noexit-v1r23 sdnuma-v1r23
 User         5434.12     5188.41
 System       4878.77     3467.09
 Elapsed     10259.06     8624.21

That shows a considerable reduction in elapsed times. It's important to
note that automatic NUMA balancing does not affect this load as processes
are too short-lived.

There is also a noticable impact on hackbench such as this example using
processes and pipes:

 hackbench-process-pipes
                               4.15.0                 4.15.0
                         noexit-v1r23           sdnuma-v1r23
 Amean     1        1.0973 (   0.00%)      0.9393 (  14.40%)
 Amean     4        1.3427 (   0.00%)      1.3730 (  -2.26%)
 Amean     7        1.4233 (   0.00%)      1.6670 ( -17.12%)
 Amean     12       3.0250 (   0.00%)      3.3013 (  -9.13%)
 Amean     21       9.0860 (   0.00%)      9.5343 (  -4.93%)
 Amean     30      14.6547 (   0.00%)     13.2433 (   9.63%)
 Amean     48      22.5447 (   0.00%)     20.4303 (   9.38%)
 Amean     79      29.2010 (   0.00%)     26.7853 (   8.27%)
 Amean     110     36.7443 (   0.00%)     35.8453 (   2.45%)
 Amean     141     45.8533 (   0.00%)     42.6223 (   7.05%)
 Amean     172     55.1317 (   0.00%)     50.6473 (   8.13%)
 Amean     203     64.4420 (   0.00%)     58.3957 (   9.38%)
 Amean     234     73.2293 (   0.00%)     67.1047 (   8.36%)
 Amean     265     80.5220 (   0.00%)     75.7330 (   5.95%)
 Amean     296     88.7567 (   0.00%)     82.1533 (   7.44%)

It's not a universal win as there are occasions when spreading wide and
quickly is a benefit but it's more of a win than it is a loss. For other
workloads, there is little difference but netperf is interesting. Without
the patch, the server and client starts on different nodes but quickly get
migrated due to wake_affine. Hence, the difference is overall performance
is marginal but detectable:

                                      4.15.0                 4.15.0
                                noexit-v1r23           sdnuma-v1r23
 Hmean     send-64         349.09 (   0.00%)      354.67 (   1.60%)
 Hmean     send-128        699.16 (   0.00%)      702.91 (   0.54%)
 Hmean     send-256       1316.34 (   0.00%)     1350.07 (   2.56%)
 Hmean     send-1024      5063.99 (   0.00%)     5124.38 (   1.19%)
 Hmean     send-2048      9705.19 (   0.00%)     9687.44 (  -0.18%)
 Hmean     send-3312     14359.48 (   0.00%)    14577.64 (   1.52%)
 Hmean     send-4096     16324.20 (   0.00%)    16393.62 (   0.43%)
 Hmean     send-8192     26112.61 (   0.00%)    26877.26 (   2.93%)
 Hmean     send-16384    37208.44 (   0.00%)    38683.43 (   3.96%)
 Hmean     recv-64         349.09 (   0.00%)      354.67 (   1.60%)
 Hmean     recv-128        699.16 (   0.00%)      702.91 (   0.54%)
 Hmean     recv-256       1316.34 (   0.00%)     1350.07 (   2.56%)
 Hmean     recv-1024      5063.99 (   0.00%)     5124.38 (   1.19%)
 Hmean     recv-2048      9705.16 (   0.00%)     9687.43 (  -0.18%)
 Hmean     recv-3312     14359.42 (   0.00%)    14577.59 (   1.52%)
 Hmean     recv-4096     16323.98 (   0.00%)    16393.55 (   0.43%)
 Hmean     recv-8192     26111.85 (   0.00%)    26876.96 (   2.93%)
 Hmean     recv-16384    37206.99 (   0.00%)    38682.41 (   3.97%)

However, what is very interesting is how automatic NUMA balancing behaves.
Each netperf instance runs long enough for balancing to activate:

 NUMA base PTE updates             4620        1473
 NUMA huge PMD updates                0           0
 NUMA page range updates           4620        1473
 NUMA hint faults                  4301        1383
 NUMA hint local faults            1309         451
 NUMA hint local percent             30          32
 NUMA pages migrated               1335         491
 AutoNUMA cost                      21%          6%

There is an unfortunate number of remote faults although tracing indicated
that the vast majority are in shared libraries. However, the tendency to
start tasks on the same node if there is capacity means that there were
far fewer PTE updates and faults incurred overall.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-6-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 302dda81e192..94aea5b91a96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5911,6 +5911,18 @@ skip_spare:
 	if (!idlest)
 		return NULL;
 
+	/*
+	 * When comparing groups across NUMA domains, it's possible for the
+	 * local domain to be very lightly loaded relative to the remote
+	 * domains but "imbalance" skews the comparison making remote CPUs
+	 * look much more favourable. When considering cross-domain, add
+	 * imbalance to the runnable load on the remote node and consider
+	 * staying local.
+	 */
+	if ((sd->flags & SD_NUMA) &&
+	    min_runnable_load + imbalance >= this_runnable_load)
+		return NULL;
+
 	if (min_runnable_load > (this_runnable_load + imbalance))
 		return NULL;
 
-- 
cgit v1.2.3-71-gd317


From 7347fc87dfe6b7315e74310ee1243dc222c68086 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 13 Feb 2018 13:37:30 +0000
Subject: sched/numa: Delay retrying placement for automatic NUMA balance after
 wake_affine()

If wake_affine() pulls a task to another node for any reason and the node is
no longer preferred then temporarily stop automatic NUMA balancing pulling
the task back. Otherwise, tasks with a strong waker/wakee relationship
may constantly fight automatic NUMA balancing over where a task should
be placed.

Once again netperf is interesting here. The performance barely changes
but automatic NUMA balancing is interesting:

 Hmean     send-64         354.67 (   0.00%)      352.15 (  -0.71%)
 Hmean     send-128        702.91 (   0.00%)      693.84 (  -1.29%)
 Hmean     send-256       1350.07 (   0.00%)     1344.19 (  -0.44%)
 Hmean     send-1024      5124.38 (   0.00%)     4941.24 (  -3.57%)
 Hmean     send-2048      9687.44 (   0.00%)     9624.45 (  -0.65%)
 Hmean     send-3312     14577.64 (   0.00%)    14514.35 (  -0.43%)
 Hmean     send-4096     16393.62 (   0.00%)    16488.30 (   0.58%)
 Hmean     send-8192     26877.26 (   0.00%)    26431.63 (  -1.66%)
 Hmean     send-16384    38683.43 (   0.00%)    38264.91 (  -1.08%)
 Hmean     recv-64         354.67 (   0.00%)      352.15 (  -0.71%)
 Hmean     recv-128        702.91 (   0.00%)      693.84 (  -1.29%)
 Hmean     recv-256       1350.07 (   0.00%)     1344.19 (  -0.44%)
 Hmean     recv-1024      5124.38 (   0.00%)     4941.24 (  -3.57%)
 Hmean     recv-2048      9687.43 (   0.00%)     9624.45 (  -0.65%)
 Hmean     recv-3312     14577.59 (   0.00%)    14514.35 (  -0.43%)
 Hmean     recv-4096     16393.55 (   0.00%)    16488.20 (   0.58%)
 Hmean     recv-8192     26876.96 (   0.00%)    26431.29 (  -1.66%)
 Hmean     recv-16384    38682.41 (   0.00%)    38263.94 (  -1.08%)

 NUMA alloc hit                 1465986     1423090
 NUMA alloc miss                      0           0
 NUMA interleave hit                  0           0
 NUMA alloc local               1465897     1423003
 NUMA base PTE updates             1473        1420
 NUMA huge PMD updates                0           0
 NUMA page range updates           1473        1420
 NUMA hint faults                  1383        1312
 NUMA hint local faults             451         124
 NUMA hint local percent             32           9

There is a slight degrading in performance but there are slightly fewer
NUMA faults. There is a large drop in the percentage of local faults but
the bulk of migrations for netperf are in small shared libraries so it's
reflecting the fact that automatic NUMA balancing has backed off. This is
a case where despite wake_affine() and automatic NUMA balancing fighting
for placement that there is a marginal benefit to rescheduling to local
data quickly. However, it should be noted that wake_affine() and automatic
NUMA balancing fighting each other constantly is undesirable.

However, the benefit in other cases is large. This is the result for NAS
with the D class sizing on a 4-socket machine:

 nas-mpi
                           4.15.0                 4.15.0
                     sdnuma-v1r23       delayretry-v1r23
 Time cg.D      557.00 (   0.00%)      431.82 (  22.47%)
 Time ep.D       77.83 (   0.00%)       79.01 (  -1.52%)
 Time is.D       26.46 (   0.00%)       26.64 (  -0.68%)
 Time lu.D      727.14 (   0.00%)      597.94 (  17.77%)
 Time mg.D      191.35 (   0.00%)      146.85 (  23.26%)

               4.15.0      4.15.0
         sdnuma-v1r23delayretry-v1r23
 User        75665.20    70413.30
 System      20321.59     8861.67
 Elapsed       766.13      634.92

 Minor Faults                  16528502     7127941
 Major Faults                      4553        5068
 NUMA alloc local               6963197     6749135
 NUMA base PTE updates        366409093   107491434
 NUMA huge PMD updates           687556      198880
 NUMA page range updates      718437765   209317994
 NUMA hint faults              13643410     4601187
 NUMA hint local faults         9212593     3063996
 NUMA hint local percent             67          66

Note the massive reduction in system CPU usage even though the percentage
of local faults is barely affected. There is a massive reduction in the
number of PTE updates showing that automatic NUMA balancing has backed off.
A critical observation is also that there is a massive reduction in minor
faults which is due to far fewer NUMA hinting faults being trapped.

There were questions on NAS OMP and how it behaved related to threads
being bound to CPUs. First, there are more gains than losses with this
patch applied and a reduction in system CPU usage:

nas-omp
                      4.16.0-rc1             4.16.0-rc1
                     sdnuma-v2r1        delayretry-v2r1
Time bt.D      436.71 (   0.00%)      430.05 (   1.53%)
Time cg.D      201.02 (   0.00%)      180.87 (  10.02%)
Time ep.D       32.84 (   0.00%)       32.68 (   0.49%)
Time is.D        9.63 (   0.00%)        9.64 (  -0.10%)
Time lu.D      331.20 (   0.00%)      304.80 (   7.97%)
Time mg.D       54.87 (   0.00%)       52.72 (   3.92%)
Time sp.D     1108.78 (   0.00%)      917.10 (  17.29%)
Time ua.D      378.81 (   0.00%)      398.83 (  -5.28%)

          4.16.0-rc1  4.16.0-rc1
         sdnuma-v2r1delayretry-v2r1
User       305633.08   296751.91
System        451.75      357.80
Elapsed      2595.73     2368.13

However, it does not close the gap between binding and being unbound. There
is negligible difference between the performance of the baseline and a
patched kernel when threads are bound so it is not presented here:

                      4.16.0-rc1             4.16.0-rc1
                 delayretry-bind     delayretry-unbound
Time bt.D      385.02 (   0.00%)      430.05 ( -11.70%)
Time cg.D      144.02 (   0.00%)      180.87 ( -25.59%)
Time ep.D       32.85 (   0.00%)       32.68 (   0.52%)
Time is.D       10.52 (   0.00%)        9.64 (   8.37%)
Time lu.D      285.31 (   0.00%)      304.80 (  -6.83%)
Time mg.D       43.21 (   0.00%)       52.72 ( -22.01%)
Time sp.D      820.24 (   0.00%)      917.10 ( -11.81%)
Time ua.D      337.09 (   0.00%)      398.83 ( -18.32%)

          4.16.0-rc1  4.16.0-rc1
        delayretry-binddelayretry-unbound
User       277731.25   296751.91
System        261.29      357.80
Elapsed      2100.55     2368.13

Unfortunately, while performance is improved by the patch, there is still
quite a long way to go before it's equivalent to hard binding.

Other workloads like hackbench, tbench, dbench and schbench are barely
affected. dbench shows a mix of gains and losses depending on the machine
although in general, the results are more stable.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Giovanni Gherdovich <ggherdovich@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180213133730.24064-7-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94aea5b91a96..33662a3bdc6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1869,6 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	unsigned long interval = HZ;
+	unsigned long numa_migrate_retry;
 
 	/* This task has no NUMA fault statistics yet */
 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1877,18 @@ static void numa_migrate_preferred(struct task_struct *p)
 
 	/* Periodically retry migrating the task to the preferred node */
 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-	p->numa_migrate_retry = jiffies + interval;
+	numa_migrate_retry = jiffies + interval;
+
+	/*
+	 * Check that the new retry threshold is after the current one. If
+	 * the retry is in the future, it implies that wake_affine has
+	 * temporarily asked NUMA balancing to backoff from placement.
+	 */
+	if (numa_migrate_retry > p->numa_migrate_retry)
+		return;
+
+	/* Safe to try placing the task on the preferred node */
+	p->numa_migrate_retry = numa_migrate_retry;
 
 	/* Success if task is already running on preferred CPU */
 	if (task_node(p) == p->numa_preferred_nid)
@@ -5759,6 +5771,48 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+	unsigned long interval;
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		return;
+
+	/* If balancing has no preference then continue gathering data */
+	if (p->numa_preferred_nid == -1)
+		return;
+
+	/*
+	 * If the wakeup is not affecting locality then it is neutral from
+	 * the perspective of NUMA balacing so continue gathering data.
+	 */
+	if (cpu_to_node(prev_cpu) == cpu_to_node(target))
+		return;
+
+	/*
+	 * Temporarily prevent NUMA balancing trying to place waker/wakee after
+	 * wakee has been moved by wake_affine. This will potentially allow
+	 * related tasks to converge and update their data placement. The
+	 * 4 * numa_scan_period is to allow the two-pass filter to migrate
+	 * hot data to the wakers node.
+	 */
+	interval = max(sysctl_numa_balancing_scan_delay,
+			 p->numa_scan_period << 2);
+	p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+
+	interval = max(sysctl_numa_balancing_scan_delay,
+			 current->numa_scan_period << 2);
+	current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+}
+#else
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+}
+#endif
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int this_cpu, int prev_cpu, int sync)
 {
@@ -5774,6 +5828,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	if (target == nr_cpumask_bits)
 		return prev_cpu;
 
+	update_wa_numa_placement(p, prev_cpu, target);
 	schedstat_inc(sd->ttwu_move_affine);
 	schedstat_inc(p->se.statistics.nr_wakeups_affine);
 	return target;
-- 
cgit v1.2.3-71-gd317


From 77a021be383ebdacb17594e280242f7fd116095f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:23 +0100
Subject: sched/core: Rename init_rq_hrtick() to hrtick_rq_init()

Do that rename in order to normalize the hrtick namespace.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-2-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7c535eee0a6..e72ca3c574fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -333,7 +333,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 }
 #endif /* CONFIG_SMP */
 
-static void init_rq_hrtick(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;
@@ -351,7 +351,7 @@ static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void init_rq_hrtick(struct rq *rq)
+static inline void hrtick_rq_init(struct rq *rq)
 {
 }
 #endif	/* CONFIG_SCHED_HRTICK */
@@ -6028,7 +6028,7 @@ void __init sched_init(void)
 		rq->last_sched_tick = 0;
 #endif
 #endif /* CONFIG_SMP */
-		init_rq_hrtick(rq);
+		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
-- 
cgit v1.2.3-71-gd317


From a364298359e74a414857bbbf3b725564feb22d09 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:24 +0100
Subject: nohz: Convert tick_nohz_tick_stopped() to bool

It makes this function more self-explanatory about what it does and how
to use it.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/tick.h     | 2 +-
 kernel/time/tick-sched.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7cc35921218e..86576d9d2311 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -113,7 +113,7 @@ enum tick_dep_bits {
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
-extern int tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..0aba0412ede5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -481,7 +481,7 @@ static int __init setup_tick_nohz(char *str)
 
 __setup("nohz=", setup_tick_nohz);
 
-int tick_nohz_tick_stopped(void)
+bool tick_nohz_tick_stopped(void)
 {
 	return __this_cpu_read(tick_cpu_sched.tick_stopped);
 }
-- 
cgit v1.2.3-71-gd317


From 22ab8bc02a5f6e8ffc418759894f7a6b0b632331 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:25 +0100
Subject: nohz: Allow to check if remote CPU tick is stopped

This check is racy but provides a good heuristic to determine whether
a CPU may need a remote tick or not.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-4-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/tick.h     | 2 ++
 kernel/time/tick-sched.c | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 86576d9d2311..7f8c9a127f5a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -114,6 +114,7 @@ enum tick_dep_bits {
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
 extern bool tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
+static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0aba0412ede5..d479b21a848b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -486,6 +486,13 @@ bool tick_nohz_tick_stopped(void)
 	return __this_cpu_read(tick_cpu_sched.tick_stopped);
 }
 
+bool tick_nohz_tick_stopped_cpu(int cpu)
+{
+	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+	return ts->tick_stopped;
+}
+
 /**
  * tick_nohz_update_jiffies - update jiffies when idle was interrupted
  *
-- 
cgit v1.2.3-71-gd317


From 1bda3f8087fce9063da0b8aef87f17a3fe541aca Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:26 +0100
Subject: sched/isolation: Isolate workqueues when "nohz_full=" is set

As we prepare for offloading the residual 1hz scheduler ticks to
workqueue, let's affine those to housekeepers so that they don't
interrupt the CPUs that don't want to be disturbed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-5-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 1 +
 kernel/sched/isolation.c        | 3 ++-
 kernel/workqueue.c              | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
 	HK_FLAG_SCHED		= (1 << 3),
 	HK_FLAG_TICK		= (1 << 4),
 	HK_FLAG_DOMAIN		= (1 << 5),
+	HK_FLAG_WQ		= (1 << 6),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..a2500c459617 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,6 +3,7 @@
  *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
  *
  * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
  *
  */
 
@@ -119,7 +120,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
 {
 	unsigned int flags;
 
-	flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
 
 	return housekeeping_setup(str, flags);
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 017044c26233..593dbe749174 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5565,12 +5565,13 @@ static void __init wq_numa_init(void)
 int __init workqueue_init_early(void)
 {
 	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+	int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
 	int i, cpu;
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
+	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3-71-gd317


From d84b31313ef8a8de55a2cbfb72f76f36d8c927fb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:27 +0100
Subject: sched/isolation: Offload residual 1Hz scheduler tick

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However this residual tick is a burden
for bare metal tasks that can't stand any interruption at all, or want
to minimize them.

The usual boot parameters "nohz_full=" or "isolcpus=nohz" will now
outsource these scheduler ticks to the global workqueue so that a
housekeeping CPU handles those remotely. The sched_class::task_tick()
implementations have been audited and look safe to be called remotely
as the target runqueue and its current task are passed in parameter
and don't seem to be accessed locally.

Note that in the case of using isolcpus, it's still up to the user to
affine the global workqueues to the housekeeping CPUs through
/sys/devices/virtual/workqueue/cpumask or domains isolation
"isolcpus=nohz,domain".

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-6-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/deadline.c  |  8 +++++
 kernel/sched/fair.c      |  7 +++-
 kernel/sched/idle_task.c |  8 +++++
 kernel/sched/isolation.c |  4 +++
 kernel/sched/rt.c        |  8 +++++
 kernel/sched/sched.h     |  2 ++
 kernel/sched/stop_task.c |  8 +++++
 8 files changed, 136 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e72ca3c574fc..5dfef458ab52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3125,6 +3125,96 @@ u64 scheduler_tick_max_deferment(void)
 
 	return jiffies_to_nsecs(next - now);
 }
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	/*
+	 * Handle the tick only if it appears the remote CPU is running in full
+	 * dynticks mode. The check is racy by nature, but missing a tick or
+	 * having one too much is no big deal because the scheduler tick updates
+	 * statistics and checks timeslices in a time-independent way, regardless
+	 * of when exactly it is running.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		struct task_struct *curr;
+		u64 delta;
+
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		curr = rq->curr;
+		delta = rq_clock_task(rq) - curr->se.exec_start;
+
+		/*
+		 * Make sure the next tick runs within a reasonable
+		 * amount of time.
+		 */
+		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+		curr->sched_class->task_tick(rq, curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
+
+	/*
+	 * Run the remote tick once per second (1Hz). This arbitrary
+	 * frequency is large enough to avoid overload but short enough
+	 * to keep scheduler internal stats reasonably up to date.
+	 */
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	BUG_ON(!tick_work_cpu);
+
+	return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
 #endif
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -5786,6 +5876,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5797,6 +5888,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..65cd5ead1759 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1776,6 +1776,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 		enqueue_pushable_dl_task(rq, p);
 }
 
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
 static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
 	update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33662a3bdc6d..e1febd252a84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9515,7 +9515,12 @@ static void rq_offline_fair(struct rq *rq)
 #endif /* CONFIG_SMP */
 
 /*
- * scheduler tick hitting a task of our scheduling class:
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d518664cce4f..e1b46e08c8e1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -51,6 +51,14 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 	rq_last_tick_reset(rq);
 }
 
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index a2500c459617..39f340dde1d7 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -61,6 +62,9 @@ void __init housekeeping_init(void)
 
 	static_branch_enable(&housekeeping_overriden);
 
+	if (housekeeping_flags & HK_FLAG_TICK)
+		sched_tick_offload_init();
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..c80563b4f6b9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2292,6 +2292,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
 #endif
 
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..c1c7c788da1c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1574,6 +1574,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
 
 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1599,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
 		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
 
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..ea8d2b6a1239 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,6 +75,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 	cgroup_account_cputime(curr, delta_exec);
 }
 
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
-- 
cgit v1.2.3-71-gd317


From dcdedb24159be3487e3dbbe1faa79ae7d00c92ac Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 21 Feb 2018 05:17:28 +0100
Subject: sched/nohz: Remove the 1 Hz tick code

Now that the 1Hz tick is offloaded to workqueues, we can safely remove
the residual code that used to handle it locally.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1519186649-3242-7-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/nohz.h |  4 ----
 kernel/sched/core.c        | 29 -----------------------------
 kernel/sched/idle_task.c   |  1 -
 kernel/sched/sched.h       | 11 +----------
 kernel/time/tick-sched.c   |  6 ------
 5 files changed, 1 insertion(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 3d3a97d9399d..094217273ff9 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu);
 static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
-#ifdef CONFIG_NO_HZ_FULL
-extern u64 scheduler_tick_max_deferment(void);
-#endif
-
 #endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5dfef458ab52..8fff4f16c510 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3096,35 +3096,9 @@ void scheduler_tick(void)
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq);
 #endif
-	rq_last_tick_reset(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
- *
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
- *
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
- *
- * Return: Maximum deferment in nanoseconds.
- */
-u64 scheduler_tick_max_deferment(void)
-{
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
-
-	next = rq->last_sched_tick + HZ;
-
-	if (time_before_eq(next, now))
-		return 0;
-
-	return jiffies_to_nsecs(next - now);
-}
 
 struct tick_work {
 	int			cpu;
@@ -6116,9 +6090,6 @@ void __init sched_init(void)
 		rq->last_load_update_tick = jiffies;
 		rq->nohz_flags = 0;
 #endif
-#ifdef CONFIG_NO_HZ_FULL
-		rq->last_sched_tick = 0;
-#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index e1b46e08c8e1..48b8a83f5185 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -48,7 +48,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
-	rq_last_tick_reset(rq);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c1c7c788da1c..dc6c8b5a24ad 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -727,9 +727,7 @@ struct rq {
 #endif /* CONFIG_SMP */
 	unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
-#ifdef CONFIG_NO_HZ_FULL
-	unsigned long last_sched_tick;
-#endif
+
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
@@ -1626,13 +1624,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 	sched_update_tick_dependency(rq);
 }
 
-static inline void rq_last_tick_reset(struct rq *rq)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	rq->last_sched_tick = jiffies;
-#endif
-}
-
 extern void update_rq_clock(struct rq *rq);
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d479b21a848b..f2fa2e940fe5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -748,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		delta = KTIME_MAX;
 	}
 
-#ifdef CONFIG_NO_HZ_FULL
-	/* Limit the tick delta to the maximum scheduler deferment */
-	if (!ts->inidle)
-		delta = min(delta, scheduler_tick_max_deferment());
-#endif
-
 	/* Calculate the next expiry time */
 	if (delta < (KTIME_MAX - basemono))
 		expires = basemono + delta;
-- 
cgit v1.2.3-71-gd317


From d1897c9538edafd4ae6bbd03cc075962ddde2c21 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 21 Feb 2018 11:39:22 -0800
Subject: cgroup: fix rule checking for threaded mode switching

A domain cgroup isn't allowed to be turned threaded if its subtree is
populated or domain controllers are enabled.  cgroup_enable_threaded()
depended on cgroup_can_be_thread_root() test to enforce this rule.  A
parent which has populated domain descendants or have domain
controllers enabled can't become a thread root, so the above rules are
enforced automatically.

However, for the root cgroup which can host mixed domain and threaded
children, cgroup_can_be_thread_root() doesn't check any of those
conditions and thus first level cgroups ends up escaping those rules.

This patch fixes the bug by adding explicit checks for those rules in
cgroup_enable_threaded().

Reported-by: Michael Kerrisk (man-pages) <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 8cfd8147df67 ("cgroup: implement cgroup v2 thread support")
Cc: stable@vger.kernel.org # v4.14+
---
 kernel/cgroup/cgroup.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8cda3bc3ae22..4bfb2908ec15 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)
 	if (cgroup_is_threaded(cgrp))
 		return 0;
 
+	/*
+	 * If @cgroup is populated or has domain controllers enabled, it
+	 * can't be switched.  While the below cgroup_can_be_thread_root()
+	 * test can catch the same conditions, that's only when @parent is
+	 * not mixable, so let's check it explicitly.
+	 */
+	if (cgroup_is_populated(cgrp) ||
+	    cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+		return -EOPNOTSUPP;
+
 	/* we're joining the parent's domain, ensure its validity */
 	if (!cgroup_is_valid_domain(dom_cgrp) ||
 	    !cgroup_can_be_thread_root(dom_cgrp))
-- 
cgit v1.2.3-71-gd317


From ad7c946b35ad455417fdd4bc0e17deda4011841b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 8 Jan 2018 14:35:52 -0800
Subject: rcu: Create RCU-specific workqueues with rescuers

RCU's expedited grace periods can participate in out-of-memory deadlocks
due to all available system_wq kthreads being blocked and there not being
memory available to create more.  This commit prevents such deadlocks
by allocating an RCU-specific workqueue_struct at early boot time, and
providing it with a rescuer to ensure forward progress.  This uses the
shiny new init_rescuer() function provided by Tejun (but indirectly).

This commit also causes SRCU to use this new RCU-specific
workqueue_struct.  Note that SRCU's use of workqueues never blocks them
waiting for readers, so this should be safe from a forward-progress
viewpoint.  Note that this moves SRCU from system_power_efficient_wq
to a normal workqueue.  In the unlikely event that this results in
measurable degradation, a separate power-efficient workqueue will be
creates for SRCU.

Reported-by: Prateek Sood <prsood@codeaurora.org>
Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/rcu/rcu.h      | 1 +
 kernel/rcu/srcutree.c | 8 +++-----
 kernel/rcu/tree.c     | 6 ++++++
 kernel/rcu/tree_exp.h | 2 +-
 4 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 1c868bcfd705..7a693e31184a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -485,6 +485,7 @@ void show_rcu_gp_kthreads(void);
 void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 045b559b9f22..743d18379256 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
  */
 static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
 {
-	srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
-				   &sdp->work, delay);
+	srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
 }
 
 /*
@@ -691,8 +690,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 	    rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
 		WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
 		srcu_gp_start(sp);
-		queue_delayed_work(system_power_efficient_wq, &sp->work,
-				   srcu_get_delay(sp));
+		queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp));
 	}
 	spin_unlock_irqrestore_rcu_node(sp, flags);
 }
@@ -1225,7 +1223,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
 	spin_unlock_irq_rcu_node(sp);
 
 	if (pushgp)
-		queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+		queue_delayed_work(rcu_gp_wq, &sp->work, delay);
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 99d404c6bbbb..2a734692a581 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4167,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 	pr_cont("\n");
 }
 
+struct workqueue_struct *rcu_gp_wq;
+
 void __init rcu_init(void)
 {
 	int cpu;
@@ -4193,6 +4195,10 @@ void __init rcu_init(void)
 		rcu_cpu_starting(cpu);
 		rcutree_online_cpu(cpu);
 	}
+
+	/* Create workqueue for expedited GPs and for Tree SRCU. */
+	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_gp_wq);
 }
 
 #include "tree_exp.h"
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6ad87642f44a..f72eefab8543 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -626,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 		rew.rew_rsp = rsp;
 		rew.rew_s = s;
 		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-		schedule_work(&rew.rew_work);
+		queue_work(rcu_gp_wq, &rew.rew_work);
 	}
 
 	/* Wait for expedited grace period to complete. */
-- 
cgit v1.2.3-71-gd317


From 04860d48a8aba5b21ae5ba1f86b0d4e3bc628fff Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 26 Feb 2018 14:49:26 +0100
Subject: locking/lockdep: Show unadorned pointers

Show unadorned pointers in lockdep reports - lockdep is a debugging
facility and hashing pointers there doesn't make a whole lotta sense.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20180226134926.23069-1-bp@alien8.de
---
 kernel/locking/lockdep.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 89b5f83f1969..12a2805dd64a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -557,7 +557,7 @@ static void print_lock(struct held_lock *hlock)
 	}
 
 	print_lock_name(lock_classes + class_idx - 1);
-	printk(KERN_CONT ", at: [<%p>] %pS\n",
+	printk(KERN_CONT ", at: [<%px>] %pS\n",
 		(void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
 }
 
@@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	if (verbose(class)) {
 		graph_unlock();
 
-		printk("\nnew class %p: %s", class->key, class->name);
+		printk("\nnew class %px: %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk(KERN_CONT "#%d", class->name_version);
 		printk(KERN_CONT "\n");
@@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
 	}
 	printk("%*s }\n", depth, "");
 
-	printk("%*s ... key      at: [<%p>] %pS\n",
+	printk("%*s ... key      at: [<%px>] %pS\n",
 		depth, "", class->key, class->key);
 }
 
@@ -2340,7 +2340,7 @@ cache_hit:
 
 		if (very_verbose(class)) {
 			printk("\nhash chain already cached, key: "
-					"%016Lx tail class: [%p] %s\n",
+					"%016Lx tail class: [%px] %s\n",
 					(unsigned long long)chain_key,
 					class->key, class->name);
 		}
@@ -2349,7 +2349,7 @@ cache_hit:
 	}
 
 	if (very_verbose(class)) {
-		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+		printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
 			(unsigned long long)chain_key, class->key, class->name);
 	}
 
@@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 void print_irqtrace_events(struct task_struct *curr)
 {
 	printk("irq event stamp: %u\n", curr->irq_events);
-	printk("hardirqs last  enabled at (%u): [<%p>] %pS\n",
+	printk("hardirqs last  enabled at (%u): [<%px>] %pS\n",
 		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
 		(void *)curr->hardirq_enable_ip);
-	printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+	printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
 		curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
 		(void *)curr->hardirq_disable_ip);
-	printk("softirqs last  enabled at (%u): [<%p>] %pS\n",
+	printk("softirqs last  enabled at (%u): [<%px>] %pS\n",
 		curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
 		(void *)curr->softirq_enable_ip);
-	printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+	printk("softirqs last disabled at (%u): [<%px>] %pS\n",
 		curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
 		(void *)curr->softirq_disable_ip);
 }
@@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 	 * Sanity check, the lock-class key must be persistent:
 	 */
 	if (!static_obj(key)) {
-		printk("BUG: key %p not in .data!\n", key);
+		printk("BUG: key %px not in .data!\n", key);
 		/*
 		 * What it says above ^^^^^, I suggest you read it.
 		 */
@@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	}
 	atomic_inc((atomic_t *)&class->ops);
 	if (very_verbose(class)) {
-		printk("\nacquire class [%p] %s", class->key, class->name);
+		printk("\nacquire class [%px] %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk(KERN_CONT "#%d", class->name_version);
 		printk(KERN_CONT "\n");
@@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
 	pr_warn("WARNING: held lock freed!\n");
 	print_kernel_ident();
 	pr_warn("-------------------------\n");
-	pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+	pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
 		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
 	print_lock(hlock);
 	lockdep_print_held_locks(curr);
-- 
cgit v1.2.3-71-gd317


From fde9fc766e96c494b82931b1d270a9a751be07c0 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Mon, 19 Feb 2018 16:55:06 +0000
Subject: signals: Move put_compat_sigset to compat.h to silence hardened
 usercopy

Since commit afcc90f8621e ("usercopy: WARN() on slab cache usercopy
region violations"), MIPS systems booting with a compat root filesystem
emit a warning when copying compat siginfo to userspace:

WARNING: CPU: 0 PID: 953 at mm/usercopy.c:81 usercopy_warn+0x98/0xe8
Bad or missing usercopy whitelist? Kernel memory exposure attempt
detected from SLAB object 'task_struct' (offset 1432, size 16)!
Modules linked in:
CPU: 0 PID: 953 Comm: S01logging Not tainted 4.16.0-rc2 #10
Stack : ffffffff808c0000 0000000000000000 0000000000000001 65ac85163f3bdc4a
	65ac85163f3bdc4a 0000000000000000 90000000ff667ab8 ffffffff808c0000
	00000000000003f8 ffffffff808d0000 00000000000000d1 0000000000000000
	000000000000003c 0000000000000000 ffffffff808c8ca8 ffffffff808d0000
	ffffffff808d0000 ffffffff80810000 fffffc0000000000 ffffffff80785c30
	0000000000000009 0000000000000051 90000000ff667eb0 90000000ff667db0
	000000007fe0d938 0000000000000018 ffffffff80449958 0000000020052798
	ffffffff808c0000 90000000ff664000 90000000ff667ab0 00000000100c0000
	ffffffff80698810 0000000000000000 0000000000000000 0000000000000000
	0000000000000000 0000000000000000 ffffffff8010d02c 65ac85163f3bdc4a
	...
Call Trace:
[<ffffffff8010d02c>] show_stack+0x9c/0x130
[<ffffffff80698810>] dump_stack+0x90/0xd0
[<ffffffff80137b78>] __warn+0x100/0x118
[<ffffffff80137bdc>] warn_slowpath_fmt+0x4c/0x70
[<ffffffff8021e4a8>] usercopy_warn+0x98/0xe8
[<ffffffff8021e68c>] __check_object_size+0xfc/0x250
[<ffffffff801bbfb8>] put_compat_sigset+0x30/0x88
[<ffffffff8011af24>] setup_rt_frame_n32+0xc4/0x160
[<ffffffff8010b8b4>] do_signal+0x19c/0x230
[<ffffffff8010c408>] do_notify_resume+0x60/0x78
[<ffffffff80106f50>] work_notifysig+0x10/0x18
---[ end trace 88fffbf69147f48a ]---

Commit 5905429ad856 ("fork: Provide usercopy whitelisting for
task_struct") noted that:

"While the blocked and saved_sigmask fields of task_struct are copied to
userspace (via sigmask_to_save() and setup_rt_frame()), it is always
copied with a static length (i.e. sizeof(sigset_t))."

However, this is not true in the case of compat signals, whose sigset
is copied by put_compat_sigset and receives size as an argument.

At most call sites, put_compat_sigset is copying a sigset from the
current task_struct. This triggers a warning when
CONFIG_HARDENED_USERCOPY is active. However, by marking this function as
static inline, the warning can be avoided because in all of these cases
the size is constant at compile time, which is allowed. The only site
where this is not the case is handling the rt_sigpending syscall, but
there the copy is being made from a stack local variable so does not
trigger the warning.

Move put_compat_sigset to compat.h, and mark it static inline. This
fixes the WARN on MIPS.

Fixes: afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations")
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Dmitry V . Levin" <ldv@altlinux.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: kernel-hardening@lists.openwall.com
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/18639/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 include/linux/compat.h | 26 ++++++++++++++++++++++++--
 kernel/compat.c        | 19 -------------------
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8a9643857c4a..c4139c7a0de0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -17,6 +17,7 @@
 #include <linux/if.h>
 #include <linux/fs.h>
 #include <linux/aio_abi.h>	/* for aio_context_t */
+#include <linux/uaccess.h>
 #include <linux/unistd.h>
 
 #include <asm/compat.h>
@@ -550,8 +551,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
-extern int put_compat_sigset(compat_sigset_t __user *compat,
-			     const sigset_t *set, unsigned int size);
+
+/*
+ * Defined inline such that size can be compile time constant, which avoids
+ * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct
+ */
+static inline int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+		  unsigned int size)
+{
+	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
+	switch (_NSIG_WORDS) {
+	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
+	}
+	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+	return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
+}
 
 asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
 		compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
diff --git a/kernel/compat.c b/kernel/compat.c
index 3247fe761f60..3f5fa8902e7d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 }
 EXPORT_SYMBOL_GPL(get_compat_sigset);
 
-int
-put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
-		  unsigned int size)
-{
-	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
-#ifdef __BIG_ENDIAN
-	compat_sigset_t v;
-	switch (_NSIG_WORDS) {
-	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
-	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
-	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
-	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
-	}
-	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
-#else
-	return copy_to_user(compat, set, size) ? -EFAULT : 0;
-#endif
-}
-
 #ifdef CONFIG_NUMA
 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
 		       compat_uptr_t __user *, pages32,
-- 
cgit v1.2.3-71-gd317


From c2e513821d5df5e772287f6d0c23fd17b7c2bb1a Mon Sep 17 00:00:00 2001
From: Mario Leinweber <marioleinweber@web.de>
Date: Fri, 2 Mar 2018 13:20:07 -0500
Subject: sched/deadline: Clean up various coding style details

- Fixed style error: Missing space before the open parenthesis
- Fixed style warnings: 2x Missing blank line after declaration

One warning left: else after return
 (I don't feel comfortable fixing that without side effects)

Signed-off-by: Mario Leinweber <marioleinweber@web.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20180302182007.28691-1-marioleinweber@web.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpudeadline.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..6a9defebbb54 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -42,8 +42,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
 		return;
 
 	/* adapted from lib/prio_heap.c */
-	while(1) {
+	while (1) {
 		u64 largest_dl;
+
 		l = left_child(idx);
 		r = right_child(idx);
 		largest = idx;
@@ -131,6 +132,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 		return 1;
 	} else {
 		int best_cpu = cpudl_maximum(cp);
+
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
 		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -205,6 +207,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 	old_idx = cp->elements[cpu].idx;
 	if (old_idx == IDX_INVALID) {
 		int new_idx = cp->size++;
+
 		cp->elements[new_idx].dl = dl;
 		cp->elements[new_idx].cpu = cpu;
 		cp->elements[cpu].idx = new_idx;
-- 
cgit v1.2.3-71-gd317


From 97fb7a0a8944bd6d2c5634e1e0fa689a5c40bc22 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Mar 2018 14:01:12 +0100
Subject: sched: Clean up and harmonize the coding style of the scheduler code
 base

A good number of small style inconsistencies have accumulated
in the scheduler core, so do a pass over them to harmonize
all these details:

 - fix speling in comments,

 - use curly braces for multi-line statements,

 - remove unnecessary parentheses from integer literals,

 - capitalize consistently,

 - remove stray newlines,

 - add comments where necessary,

 - remove invalid/unnecessary comments,

 - align structure definitions and other data types vertically,

 - add missing newlines for increased readability,

 - fix vertical tabulation where it's misaligned,

 - harmonize preprocessor conditional block labeling
   and vertical alignment,

 - remove line-breaks where they uglify the code,

 - add newline after local variable definitions,

No change in functionality:

  md5:
     1191fa0a890cfa8132156d2959d7e9e2  built-in.o.before.asm
     1191fa0a890cfa8132156d2959d7e9e2  built-in.o.after.asm

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/autogroup.c         |  12 +-
 kernel/sched/autogroup.h         |   8 +-
 kernel/sched/clock.c             |  22 +-
 kernel/sched/core.c              |   6 +-
 kernel/sched/cpuacct.c           |  20 +-
 kernel/sched/cpudeadline.c       |  13 +-
 kernel/sched/cpudeadline.h       |  27 +-
 kernel/sched/cpufreq_schedutil.c | 129 +++++-----
 kernel/sched/cpupri.c            |   9 +-
 kernel/sched/cpupri.h            |  24 +-
 kernel/sched/cputime.c           |  48 ++--
 kernel/sched/deadline.c          |  51 ++--
 kernel/sched/debug.c             |  88 +++----
 kernel/sched/fair.c              | 183 +++++++-------
 kernel/sched/idle.c              |   6 +-
 kernel/sched/idle_task.c         |   3 +-
 kernel/sched/isolation.c         |   2 +-
 kernel/sched/loadavg.c           |  30 +--
 kernel/sched/membarrier.c        |  18 +-
 kernel/sched/rt.c                |  25 +-
 kernel/sched/sched.h             | 529 ++++++++++++++++++++-------------------
 kernel/sched/stats.c             |   7 +-
 kernel/sched/stats.h             |  86 +++----
 kernel/sched/stop_task.c         |   3 +-
 kernel/sched/swait.c             |   3 +
 kernel/sched/topology.c          |  42 ++--
 kernel/sched/wait.c              |   4 +
 kernel/sched/wait_bit.c          |  18 +-
 28 files changed, 706 insertions(+), 710 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..ff1b7b647b86 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -168,18 +168,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	autogroup_kref_put(prev);
 }
 
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
 void sched_autogroup_create_attach(struct task_struct *p)
 {
 	struct autogroup *ag = autogroup_create();
 
 	autogroup_move_group(p, ag);
-	/* drop extra reference added by autogroup_create() */
+
+	/* Drop extra reference added by autogroup_create(): */
 	autogroup_kref_put(ag);
 }
 EXPORT_SYMBOL(sched_autogroup_create_attach);
 
-/* Cannot be called under siglock.  Currently has no users */
+/* Cannot be called under siglock. Currently has no users: */
 void sched_autogroup_detach(struct task_struct *p)
 {
 	autogroup_move_group(p, &autogroup_default);
@@ -202,7 +203,6 @@ static int __init setup_autogroup(char *str)
 
 	return 1;
 }
-
 __setup("noautogroup", setup_autogroup);
 
 #ifdef CONFIG_PROC_FS
@@ -224,7 +224,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 	if (nice < 0 && !can_nice(current, nice))
 		return -EPERM;
 
-	/* this is a heavy operation taking global locks.. */
+	/* This is a heavy operation, taking global locks.. */
 	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
 		return -EAGAIN;
 
@@ -267,4 +267,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
-#endif /* CONFIG_SCHED_DEBUG */
+#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..49e6ec9559cf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -7,9 +7,9 @@
 
 struct autogroup {
 	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
+	 * Reference doesn't mean how many threads attach to this
+	 * autogroup now. It just stands for the number of tasks
+	 * which could use this autogroup.
 	 */
 	struct kref		kref;
 	struct task_group	*tg;
@@ -56,11 +56,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
 	return tg;
 }
 
-#ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
 	return 0;
 }
-#endif
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..7da6bec8a2ff 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
 /*
- * sched_clock for unstable cpu clocks
+ * sched_clock() for unstable CPU clocks
  *
  *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
  *
@@ -11,7 +11,7 @@
  *   Guillaume Chazarain <guichaz@gmail.com>
  *
  *
- * What:
+ * What this file implements:
  *
  * cpu_clock(i) provides a fast (execution time) high resolution
  * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
  * at 0 on boot (but people really shouldn't rely on that).
  *
  * cpu_clock(i)       -- can be used from any context, including NMI.
- * local_clock()      -- is cpu_clock() on the current cpu.
+ * local_clock()      -- is cpu_clock() on the current CPU.
  *
  * sched_clock_cpu(i)
  *
- * How:
+ * How it is implemented:
  *
  * The implementation either uses sched_clock() when
  * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -302,21 +302,21 @@ again:
 	 * cmpxchg64 below only protects one readout.
 	 *
 	 * We must reread via sched_clock_local() in the retry case on
-	 * 32bit as an NMI could use sched_clock_local() via the
+	 * 32-bit kernels as an NMI could use sched_clock_local() via the
 	 * tracer and hit between the readout of
-	 * the low32bit and the high 32bit portion.
+	 * the low 32-bit and the high 32-bit portion.
 	 */
 	this_clock = sched_clock_local(my_scd);
 	/*
-	 * We must enforce atomic readout on 32bit, otherwise the
-	 * update on the remote cpu can hit inbetween the readout of
-	 * the low32bit and the high 32bit portion.
+	 * We must enforce atomic readout on 32-bit, otherwise the
+	 * update on the remote CPU can hit inbetween the readout of
+	 * the low 32-bit and the high 32-bit portion.
 	 */
 	remote_clock = cmpxchg64(&scd->clock, 0, 0);
 #else
 	/*
-	 * On 64bit the read of [my]scd->clock is atomic versus the
-	 * update, so we can avoid the above 32bit dance.
+	 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32-bit dance.
 	 */
 	sched_clock_local(my_scd);
 again:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8fff4f16c510..9427b59551c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -135,7 +135,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 		 *					[L] ->on_rq
 		 *	RELEASE (rq->lock)
 		 *
-		 * If we observe the old cpu in task_rq_lock, the acquire of
+		 * If we observe the old CPU in task_rq_lock, the acquire of
 		 * the old rq->lock will fully serialize against the stores.
 		 *
 		 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -1457,7 +1457,7 @@ EXPORT_SYMBOL_GPL(kick_process);
  *
  *  - cpu_active must be a subset of cpu_online
  *
- *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
  *    see __set_cpus_allowed_ptr(). At this point the newly online
  *    CPU isn't yet part of the sched domains, and balancing will not
  *    see it.
@@ -3037,7 +3037,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 	/*
-	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * 64-bit doesn't need locks to atomically read a 64-bit value.
 	 * So we have a optimization chance when the task's delta_exec is 0.
 	 * Reading ->on_cpu is racy, but this is ok.
 	 *
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..1abd325e733a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -18,7 +18,7 @@
  * (balbir@in.ibm.com).
  */
 
-/* Time spent by the tasks of the cpu accounting group executing in ... */
+/* Time spent by the tasks of the CPU accounting group executing in ... */
 enum cpuacct_stat_index {
 	CPUACCT_STAT_USER,	/* ... user mode */
 	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
@@ -35,12 +35,12 @@ struct cpuacct_usage {
 	u64	usages[CPUACCT_STAT_NSTATS];
 };
 
-/* track cpu usage of a group of tasks and its child groups */
+/* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	struct cpuacct_usage __percpu *cpuusage;
-	struct kernel_cpustat __percpu *cpustat;
+	struct cgroup_subsys_state	css;
+	/* cpuusage holds pointer to a u64-type object on every CPU */
+	struct cpuacct_usage __percpu	*cpuusage;
+	struct kernel_cpustat __percpu	*cpustat;
 };
 
 static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +48,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 	return css ? container_of(css, struct cpuacct, css) : NULL;
 }
 
-/* return cpu accounting group to which this task belongs */
+/* Return CPU accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
 	return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +65,7 @@ static struct cpuacct root_cpuacct = {
 	.cpuusage	= &root_cpuacct_cpuusage,
 };
 
-/* create a new cpu accounting group */
+/* Create a new CPU accounting group */
 static struct cgroup_subsys_state *
 cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -96,7 +96,7 @@ out:
 	return ERR_PTR(-ENOMEM);
 }
 
-/* destroy an existing cpu accounting group */
+/* Destroy an existing CPU accounting group */
 static void cpuacct_css_free(struct cgroup_subsys_state *css)
 {
 	struct cpuacct *ca = css_ca(css);
@@ -162,7 +162,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 #endif
 }
 
-/* return total cpu usage (in nanoseconds) of a group */
+/* Return total CPU usage (in nanoseconds) of a group */
 static u64 __cpuusage_read(struct cgroup_subsys_state *css,
 			   enum cpuacct_stat_index index)
 {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 6a9defebbb54..cb172b61d191 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,7 +10,6 @@
  *  as published by the Free Software Foundation; version 2
  *  of the License.
  */
-
 #include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -147,9 +146,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 }
 
 /*
- * cpudl_clear - remove a cpu from the cpudl max-heap
+ * cpudl_clear - remove a CPU from the cpudl max-heap
  * @cp: the cpudl max-heap context
- * @cpu: the target cpu
+ * @cpu: the target CPU
  *
  * Notes: assumes cpu_rq(cpu)->lock is locked
  *
@@ -188,8 +187,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
 /*
  * cpudl_set - update the cpudl max-heap
  * @cp: the cpudl max-heap context
- * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
  *
  * Notes: assumes cpu_rq(cpu)->lock is locked
  *
@@ -224,7 +223,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 /*
  * cpudl_set_freecpu - Set the cpudl.free_cpus
  * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
  */
 void cpudl_set_freecpu(struct cpudl *cp, int cpu)
 {
@@ -234,7 +233,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
 /*
  * cpudl_clear_freecpu - Clear the cpudl.free_cpus
  * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
  */
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
 {
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..c26e7a0e5a66 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,28 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUDL_H
-#define _LINUX_CPUDL_H
-
 #include <linux/sched.h>
 #include <linux/sched/deadline.h>
 
-#define IDX_INVALID     -1
+#define IDX_INVALID		-1
 
 struct cpudl_item {
-	u64 dl;
-	int cpu;
-	int idx;
+	u64			dl;
+	int			cpu;
+	int			idx;
 };
 
 struct cpudl {
-	raw_spinlock_t lock;
-	int size;
-	cpumask_var_t free_cpus;
-	struct cpudl_item *elements;
+	raw_spinlock_t		lock;
+	int			size;
+	cpumask_var_t		free_cpus;
+	struct cpudl_item	*elements;
 };
 
-
 #ifdef CONFIG_SMP
-int cpudl_find(struct cpudl *cp, struct task_struct *p,
-	       struct cpumask *later_mask);
+int  cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
 void cpudl_clear(struct cpudl *cp, int cpu);
-int cpudl_init(struct cpudl *cp);
+int  cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
-
-#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..0dad8160e00f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -20,52 +20,52 @@
 #include "sched.h"
 
 struct sugov_tunables {
-	struct gov_attr_set attr_set;
-	unsigned int rate_limit_us;
+	struct gov_attr_set	attr_set;
+	unsigned int		rate_limit_us;
 };
 
 struct sugov_policy {
-	struct cpufreq_policy *policy;
-
-	struct sugov_tunables *tunables;
-	struct list_head tunables_hook;
-
-	raw_spinlock_t update_lock;  /* For shared policies */
-	u64 last_freq_update_time;
-	s64 freq_update_delay_ns;
-	unsigned int next_freq;
-	unsigned int cached_raw_freq;
-
-	/* The next fields are only needed if fast switch cannot be used. */
-	struct irq_work irq_work;
-	struct kthread_work work;
-	struct mutex work_lock;
-	struct kthread_worker worker;
-	struct task_struct *thread;
-	bool work_in_progress;
-
-	bool need_freq_update;
+	struct cpufreq_policy	*policy;
+
+	struct sugov_tunables	*tunables;
+	struct list_head	tunables_hook;
+
+	raw_spinlock_t		update_lock;	/* For shared policies */
+	u64			last_freq_update_time;
+	s64			freq_update_delay_ns;
+	unsigned int		next_freq;
+	unsigned int		cached_raw_freq;
+
+	/* The next fields are only needed if fast switch cannot be used: */
+	struct			irq_work irq_work;
+	struct			kthread_work work;
+	struct			mutex work_lock;
+	struct			kthread_worker worker;
+	struct task_struct	*thread;
+	bool			work_in_progress;
+
+	bool			need_freq_update;
 };
 
 struct sugov_cpu {
-	struct update_util_data update_util;
-	struct sugov_policy *sg_policy;
-	unsigned int cpu;
+	struct update_util_data	update_util;
+	struct sugov_policy	*sg_policy;
+	unsigned int		cpu;
 
-	bool iowait_boost_pending;
-	unsigned int iowait_boost;
-	unsigned int iowait_boost_max;
+	bool			iowait_boost_pending;
+	unsigned int		iowait_boost;
+	unsigned int		iowait_boost_max;
 	u64 last_update;
 
-	/* The fields below are only needed when sharing a policy. */
-	unsigned long util_cfs;
-	unsigned long util_dl;
-	unsigned long max;
-	unsigned int flags;
+	/* The fields below are only needed when sharing a policy: */
+	unsigned long		util_cfs;
+	unsigned long		util_dl;
+	unsigned long		max;
+	unsigned int		flags;
 
-	/* The field below is for single-CPU policies only. */
+	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
-	unsigned long saved_idle_calls;
+	unsigned long		saved_idle_calls;
 #endif
 };
 
@@ -79,9 +79,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 
 	/*
 	 * Since cpufreq_update_util() is called with rq->lock held for
-	 * the @target_cpu, our per-cpu data is fully serialized.
+	 * the @target_cpu, our per-CPU data is fully serialized.
 	 *
-	 * However, drivers cannot in general deal with cross-cpu
+	 * However, drivers cannot in general deal with cross-CPU
 	 * requests, so while get_next_freq() will work, our
 	 * sugov_update_commit() call may not for the fast switching platforms.
 	 *
@@ -111,6 +111,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 	}
 
 	delta_ns = time - sg_policy->last_freq_update_time;
+
 	return delta_ns >= sg_policy->freq_update_delay_ns;
 }
 
@@ -345,8 +346,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 	return get_next_freq(sg_policy, util, max);
 }
 
-static void sugov_update_shared(struct update_util_data *hook, u64 time,
-				unsigned int flags)
+static void
+sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -423,8 +424,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 	return sprintf(buf, "%u\n", tunables->rate_limit_us);
 }
 
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
-				   size_t count)
+static ssize_t
+rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
 {
 	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 	struct sugov_policy *sg_policy;
@@ -479,11 +480,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 {
 	struct task_struct *thread;
 	struct sched_attr attr = {
-		.size = sizeof(struct sched_attr),
-		.sched_policy = SCHED_DEADLINE,
-		.sched_flags = SCHED_FLAG_SUGOV,
-		.sched_nice = 0,
-		.sched_priority = 0,
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_flags	= SCHED_FLAG_SUGOV,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
 		/*
 		 * Fake (unused) bandwidth; workaround to "fix"
 		 * priority inheritance.
@@ -663,21 +664,21 @@ static int sugov_start(struct cpufreq_policy *policy)
 	struct sugov_policy *sg_policy = policy->governor_data;
 	unsigned int cpu;
 
-	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
-	sg_policy->last_freq_update_time = 0;
-	sg_policy->next_freq = UINT_MAX;
-	sg_policy->work_in_progress = false;
-	sg_policy->need_freq_update = false;
-	sg_policy->cached_raw_freq = 0;
+	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+	sg_policy->last_freq_update_time	= 0;
+	sg_policy->next_freq			= UINT_MAX;
+	sg_policy->work_in_progress		= false;
+	sg_policy->need_freq_update		= false;
+	sg_policy->cached_raw_freq		= 0;
 
 	for_each_cpu(cpu, policy->cpus) {
 		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
-		sg_cpu->cpu = cpu;
-		sg_cpu->sg_policy = sg_policy;
-		sg_cpu->flags = 0;
-		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+		sg_cpu->cpu			= cpu;
+		sg_cpu->sg_policy		= sg_policy;
+		sg_cpu->flags			= 0;
+		sg_cpu->iowait_boost_max	= policy->cpuinfo.max_freq;
 	}
 
 	for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +722,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_governor schedutil_gov = {
-	.name = "schedutil",
-	.owner = THIS_MODULE,
-	.dynamic_switching = true,
-	.init = sugov_init,
-	.exit = sugov_exit,
-	.start = sugov_start,
-	.stop = sugov_stop,
-	.limits = sugov_limits,
+	.name			= "schedutil",
+	.owner			= THIS_MODULE,
+	.dynamic_switching	= true,
+	.init			= sugov_init,
+	.exit			= sugov_exit,
+	.start			= sugov_start,
+	.stop			= sugov_stop,
+	.limits			= sugov_limits,
 };
 
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..f43e14ccb67d 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
  *
  *  going from the lowest priority to the highest.  CPUs in the INVALID state
  *  are not eligible for routing.  The system maintains this state with
- *  a 2 dimensional bitmap (the first for priority class, the second for cpus
+ *  a 2 dimensional bitmap (the first for priority class, the second for CPUs
  *  in that class).  Therefore a typical application without affinity
  *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
  *  searches).  For tasks with affinity restrictions, the algorithm has a
@@ -26,7 +26,6 @@
  *  as published by the Free Software Foundation; version 2
  *  of the License.
  */
-
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
@@ -128,9 +127,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 }
 
 /**
- * cpupri_set - update the cpu priority setting
+ * cpupri_set - update the CPU priority setting
  * @cp: The cpupri context
- * @cpu: The target cpu
+ * @cpu: The target CPU
  * @newpri: The priority (INVALID-RT99) to assign to this CPU
  *
  * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +150,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 		return;
 
 	/*
-	 * If the cpu was currently mapped to a different value, we
+	 * If the CPU was currently mapped to a different value, we
 	 * need to map it to the new value then remove the old value.
 	 * Note, we must add the new value first, otherwise we risk the
 	 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..141a06c914c6 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,26 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUPRI_H
-#define _LINUX_CPUPRI_H
-
 #include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
 
-#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE     0
-#define CPUPRI_NORMAL   1
+#define CPUPRI_INVALID		-1
+#define CPUPRI_IDLE		 0
+#define CPUPRI_NORMAL		 1
 /* values 2-101 are RT priorities 0-99 */
 
 struct cpupri_vec {
-	atomic_t	count;
-	cpumask_var_t	mask;
+	atomic_t		count;
+	cpumask_var_t		mask;
 };
 
 struct cpupri {
-	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-	int *cpu_to_pri;
+	struct cpupri_vec	pri_to_cpu[CPUPRI_NR_PRIORITIES];
+	int			*cpu_to_pri;
 };
 
 #ifdef CONFIG_SMP
-int  cpupri_find(struct cpupri *cp,
-		 struct task_struct *p, struct cpumask *lowest_mask);
+int  cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp);
+int  cpupri_init(struct cpupri *cp);
 void cpupri_cleanup(struct cpupri *cp);
 #endif
-
-#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..d3b450b57ade 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -113,9 +113,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 }
 
 /*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
+ * Account user CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in user space since the last update
  */
 void account_user_time(struct task_struct *p, u64 cputime)
 {
@@ -135,9 +135,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
 }
 
 /*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
+ * Account guest CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in virtual machine since the last update
  */
 void account_guest_time(struct task_struct *p, u64 cputime)
 {
@@ -159,9 +159,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
 }
 
 /*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
+ * Account system CPU time to a process and desired cpustat field
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in kernel space since the last update
  * @index: pointer to cpustat field that has to be updated
  */
 void account_system_index_time(struct task_struct *p,
@@ -179,10 +179,10 @@ void account_system_index_time(struct task_struct *p,
 }
 
 /*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
+ * Account system CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime: the CPU time spent in kernel space since the last update
  */
 void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
 {
@@ -205,7 +205,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
 
 /*
  * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
+ * @cputime: the CPU time spent in involuntary wait
  */
 void account_steal_time(u64 cputime)
 {
@@ -216,7 +216,7 @@ void account_steal_time(u64 cputime)
 
 /*
  * Account for idle time.
- * @cputime: the cpu time spent in idle wait
+ * @cputime: the CPU time spent in idle wait
  */
 void account_idle_time(u64 cputime)
 {
@@ -338,7 +338,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
+ * @p: the process that the CPU time gets accounted to
  * @user_tick: is the tick from userspace
  * @rq: the pointer to rq
  *
@@ -400,17 +400,16 @@ static void irqtime_account_idle_ticks(int ticks)
 	irqtime_account_process_tick(current, 0, rq, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_idle_ticks(int ticks) { }
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-						struct rq *rq, int nr_ticks) {}
+						struct rq *rq, int nr_ticks) { }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 void vtime_common_task_switch(struct task_struct *prev)
 {
 	if (is_idle_task(prev))
@@ -421,8 +420,7 @@ void vtime_common_task_switch(struct task_struct *prev)
 	vtime_flush(prev);
 	arch_vtime_task_switch(prev);
 }
-#endif
-
+# endif
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
 
@@ -469,10 +467,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 	*ut = cputime.utime;
 	*st = cputime.stime;
 }
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
+
 /*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
+ * Account a single tick of CPU time.
+ * @p: the process that the CPU time gets accounted to
  * @user_tick: indicates if the tick is a user or a system tick
  */
 void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 65cd5ead1759..58f8b7b37983 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -539,12 +539,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 
 		/*
 		 * If we cannot preempt any rq, fall back to pick any
-		 * online cpu.
+		 * online CPU:
 		 */
 		cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
 		if (cpu >= nr_cpu_ids) {
 			/*
-			 * Fail to find any suitable cpu.
+			 * Failed to find any suitable CPU.
 			 * The task will never come back!
 			 */
 			BUG_ON(dl_bandwidth_enabled());
@@ -608,8 +608,7 @@ static inline void queue_pull_task(struct rq *rq)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
-				  int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
 
 /*
  * We are being explicitly informed that a new instance is starting,
@@ -1873,7 +1872,7 @@ static int find_later_rq(struct task_struct *task)
 
 	/*
 	 * We have to consider system topology and task affinity
-	 * first, then we can look for a suitable cpu.
+	 * first, then we can look for a suitable CPU.
 	 */
 	if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
 		return -1;
@@ -1887,7 +1886,7 @@ static int find_later_rq(struct task_struct *task)
 	 * Now we check how well this matches with task's
 	 * affinity and system topology.
 	 *
-	 * The last cpu where the task run is our first
+	 * The last CPU where the task run is our first
 	 * guess, since it is most likely cache-hot there.
 	 */
 	if (cpumask_test_cpu(cpu, later_mask))
@@ -1917,9 +1916,9 @@ static int find_later_rq(struct task_struct *task)
 			best_cpu = cpumask_first_and(later_mask,
 							sched_domain_span(sd));
 			/*
-			 * Last chance: if a cpu being in both later_mask
+			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our
-			 * choice. Of course, the latest possible cpu is
+			 * choice. Of course, the latest possible CPU is
 			 * already under consideration through later_mask.
 			 */
 			if (best_cpu < nr_cpu_ids) {
@@ -2075,7 +2074,7 @@ retry:
 		if (task == next_task) {
 			/*
 			 * The task is still there. We don't try
-			 * again, some other cpu will pull it when ready.
+			 * again, some other CPU will pull it when ready.
 			 */
 			goto out;
 		}
@@ -2308,7 +2307,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	/*
 	 * Since this might be the only -deadline task on the rq,
 	 * this is the right place to try to pull some other one
-	 * from an overloaded cpu, if any.
+	 * from an overloaded CPU, if any.
 	 */
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
@@ -2634,17 +2633,17 @@ void __dl_clear_params(struct task_struct *p)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 
-	dl_se->dl_runtime = 0;
-	dl_se->dl_deadline = 0;
-	dl_se->dl_period = 0;
-	dl_se->flags = 0;
-	dl_se->dl_bw = 0;
-	dl_se->dl_density = 0;
+	dl_se->dl_runtime		= 0;
+	dl_se->dl_deadline		= 0;
+	dl_se->dl_period		= 0;
+	dl_se->flags			= 0;
+	dl_se->dl_bw			= 0;
+	dl_se->dl_density		= 0;
 
-	dl_se->dl_throttled = 0;
-	dl_se->dl_yielded = 0;
-	dl_se->dl_non_contending = 0;
-	dl_se->dl_overrun = 0;
+	dl_se->dl_throttled		= 0;
+	dl_se->dl_yielded		= 0;
+	dl_se->dl_non_contending	= 0;
+	dl_se->dl_overrun		= 0;
 }
 
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2663,21 +2662,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 #ifdef CONFIG_SMP
 int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
 {
-	unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
-							cs_cpus_allowed);
+	unsigned int dest_cpu;
 	struct dl_bw *dl_b;
 	bool overflow;
 	int cpus, ret;
 	unsigned long flags;
 
+	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
 	rcu_read_lock_sched();
 	dl_b = dl_bw_of(dest_cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
 	cpus = dl_bw_cpus(dest_cpu);
 	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
-	if (overflow)
+	if (overflow) {
 		ret = -EBUSY;
-	else {
+	} else {
 		/*
 		 * We reserve space for this task in the destination
 		 * root_domain, as we can't fail after this point.
@@ -2689,6 +2689,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
 	}
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
+
 	return ret;
 }
 
@@ -2709,6 +2710,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 		ret = 0;
 	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
 	rcu_read_unlock_sched();
+
 	return ret;
 }
 
@@ -2726,6 +2728,7 @@ bool dl_cpu_busy(unsigned int cpu)
 	overflow = __dl_overflow(dl_b, cpus, 0, 0);
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
+
 	return overflow;
 }
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ca0130ed4f9..7c82a9b88510 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -9,7 +9,6 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-
 #include <linux/proc_fs.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task.h>
@@ -274,34 +273,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 	if (table == NULL)
 		return NULL;
 
-	set_table_entry(&table[0], "min_interval", &sd->min_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[1], "max_interval", &sd->max_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[9], "cache_nice_tries",
-		&sd->cache_nice_tries,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[10], "flags", &sd->flags,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "max_newidle_lb_cost",
-		&sd->max_newidle_lb_cost,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[12], "name", sd->name,
-		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+	set_table_entry(&table[0] , "min_interval",	   &sd->min_interval,	     sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[1] , "max_interval",	   &sd->max_interval,	     sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[2] , "busy_idx",		   &sd->busy_idx,	     sizeof(int) , 0644, proc_dointvec_minmax,   true );
+	set_table_entry(&table[3] , "idle_idx",		   &sd->idle_idx,	     sizeof(int) , 0644, proc_dointvec_minmax,   true );
+	set_table_entry(&table[4] , "newidle_idx",	   &sd->newidle_idx,	     sizeof(int) , 0644, proc_dointvec_minmax,   true );
+	set_table_entry(&table[5] , "wake_idx",		   &sd->wake_idx,	     sizeof(int) , 0644, proc_dointvec_minmax,   true );
+	set_table_entry(&table[6] , "forkexec_idx",	   &sd->forkexec_idx,	     sizeof(int) , 0644, proc_dointvec_minmax,   true );
+	set_table_entry(&table[7] , "busy_factor",	   &sd->busy_factor,	     sizeof(int) , 0644, proc_dointvec_minmax,   false);
+	set_table_entry(&table[8] , "imbalance_pct",	   &sd->imbalance_pct,	     sizeof(int) , 0644, proc_dointvec_minmax,   false);
+	set_table_entry(&table[9] , "cache_nice_tries",	   &sd->cache_nice_tries,    sizeof(int) , 0644, proc_dointvec_minmax,   false);
+	set_table_entry(&table[10], "flags",		   &sd->flags,		     sizeof(int) , 0644, proc_dointvec_minmax,   false);
+	set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name",		   sd->name,		CORENAME_MAX_SIZE, 0444, proc_dostring,		 false);
 	/* &table[13] is terminator */
 
 	return table;
@@ -332,8 +316,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 	return table;
 }
 
-static cpumask_var_t sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
+static cpumask_var_t		sd_sysctl_cpus;
+static struct ctl_table_header	*sd_sysctl_header;
 
 void register_sched_domain_sysctl(void)
 {
@@ -413,14 +397,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 {
 	struct sched_entity *se = tg->se[cpu];
 
-#define P(F) \
-	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) \
-	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
-#define PN(F) \
-	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) \
-	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define P(F)		SEQ_printf(m, "  .%-30s: %lld\n",	#F, (long long)F)
+#define P_SCHEDSTAT(F)	SEQ_printf(m, "  .%-30s: %lld\n",	#F, (long long)schedstat_val(F))
+#define PN(F)		SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F)	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
 
 	if (!se)
 		return;
@@ -428,6 +408,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	PN(se->exec_start);
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
+
 	if (schedstat_enabled()) {
 		PN_SCHEDSTAT(se->statistics.wait_start);
 		PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +421,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 		PN_SCHEDSTAT(se->statistics.wait_sum);
 		P_SCHEDSTAT(se->statistics.wait_count);
 	}
+
 	P(se->load.weight);
 	P(se->runnable_weight);
 #ifdef CONFIG_SMP
@@ -464,6 +446,7 @@ static char *task_group_path(struct task_group *tg)
 		return group_path;
 
 	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+
 	return group_path;
 }
 #endif
@@ -799,9 +782,9 @@ void sysrq_sched_debug_show(void)
 /*
  * This itererator needs some explanation.
  * It returns 1 for the header position.
- * This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * This means 2 is CPU 0.
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
  */
 static void *sched_debug_start(struct seq_file *file, loff_t *offset)
 {
@@ -821,6 +804,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
 
 	if (n < nr_cpu_ids)
 		return (void *)(unsigned long)(n + 2);
+
 	return NULL;
 }
 
@@ -835,10 +819,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
 }
 
 static const struct seq_operations sched_debug_sops = {
-	.start = sched_debug_start,
-	.next = sched_debug_next,
-	.stop = sched_debug_stop,
-	.show = sched_debug_show,
+	.start		= sched_debug_start,
+	.next		= sched_debug_next,
+	.stop		= sched_debug_stop,
+	.show		= sched_debug_show,
 };
 
 static int sched_debug_release(struct inode *inode, struct file *file)
@@ -876,14 +860,10 @@ static int __init init_sched_debug_procfs(void)
 
 __initcall(init_sched_debug_procfs);
 
-#define __P(F) \
-	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
-	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
-	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
-	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define __P(F)	SEQ_printf(m, "%-45s:%21Ld\n",	     #F, (long long)F)
+#define   P(F)	SEQ_printf(m, "%-45s:%21Ld\n",	     #F, (long long)p->F)
+#define __PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define   PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e1febd252a84..1f877de96c9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,6 @@
  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
-
 #include <linux/sched/mm.h>
 #include <linux/sched/topology.h>
 
@@ -103,7 +102,7 @@ const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
 #ifdef CONFIG_SMP
 /*
- * For asym packing, by default the lower numbered cpu has higher priority.
+ * For asym packing, by default the lower numbered CPU has higher priority.
  */
 int __weak arch_asym_cpu_priority(int cpu)
 {
@@ -1181,7 +1180,7 @@ pid_t task_numa_group_id(struct task_struct *p)
 }
 
 /*
- * The averaged statistics, shared & private, memory & cpu,
+ * The averaged statistics, shared & private, memory & CPU,
  * occupy the first half of the array. The second half of the
  * array is for current counters, which are averaged into the
  * first set by task_numa_placement.
@@ -1587,7 +1586,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * be incurred if the tasks were swapped.
 	 */
 	if (cur) {
-		/* Skip this swap candidate if cannot move to the source cpu */
+		/* Skip this swap candidate if cannot move to the source CPU: */
 		if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
 			goto unlock;
 
@@ -1631,7 +1630,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		goto balance;
 	}
 
-	/* Balance doesn't matter much if we're running a task per cpu */
+	/* Balance doesn't matter much if we're running a task per CPU: */
 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
 			dst_rq->nr_running == 1)
 		goto assign;
@@ -1676,7 +1675,7 @@ balance:
 	 */
 	if (!cur) {
 		/*
-		 * select_idle_siblings() uses an per-cpu cpumask that
+		 * select_idle_siblings() uses an per-CPU cpumask that
 		 * can be used from IRQ context.
 		 */
 		local_irq_disable();
@@ -3362,7 +3361,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 }
 
 /*
- * Called within set_task_rq() right before setting a task's cpu. The
+ * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
  * including the state of rq->lock, should be made.
  */
@@ -3541,7 +3540,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 
 	/*
 	 * runnable_sum can't be lower than running_sum
-	 * As running sum is scale with cpu capacity wehreas the runnable sum
+	 * As running sum is scale with CPU capacity wehreas the runnable sum
 	 * is not we rescale running_sum 1st
 	 */
 	running_sum = se->avg.util_sum /
@@ -4688,7 +4687,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!se)
 		add_nr_running(rq, task_delta);
 
-	/* determine whether we need to wake up potentially idle cpu */
+	/* Determine whether we need to wake up potentially idle CPU: */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
 		resched_curr(rq);
 }
@@ -5053,7 +5052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 }
 
 /*
- * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
  *
  * The race is harmless, since modifying bandwidth settings of unhooked group
  * bits doesn't do much.
@@ -5098,7 +5097,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		 */
 		cfs_rq->runtime_remaining = 1;
 		/*
-		 * Offline rq is schedulable till cpu is completely disabled
+		 * Offline rq is schedulable till CPU is completely disabled
 		 * in take_cpu_down(), so we prevent new cfs throttling here.
 		 */
 		cfs_rq->runtime_enabled = 0;
@@ -5335,8 +5334,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
  *
  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
  *
- * If a cpu misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when cpu may be busy, then we have:
+ * If a CPU misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when CPU may be busy, then we have:
  *
  *   load_n   = (1 - 1/2^i)^n * load_0
  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
@@ -5480,7 +5479,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
  *
  * Therefore we need to avoid the delta approach from the regular tick when
@@ -5591,7 +5590,7 @@ void cpu_load_update_active(struct rq *this_rq)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu weighted
+ * Return a low guess at the load of a migration-source CPU weighted
  * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
@@ -5609,7 +5608,7 @@ static unsigned long source_load(int cpu, int type)
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu weighted
+ * Return a high guess at the load of a migration-target CPU weighted
  * according to the scheduling class and "nice" value.
  */
 static unsigned long target_load(int cpu, int type)
@@ -5889,7 +5888,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		max_spare_cap = 0;
 
 		for_each_cpu(i, sched_group_span(group)) {
-			/* Bias balancing toward cpus of our domain */
+			/* Bias balancing toward CPUs of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
 			else
@@ -5919,7 +5918,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 			if (min_runnable_load > (runnable_load + imbalance)) {
 				/*
 				 * The runnable load is significantly smaller
-				 * so we can pick this new cpu
+				 * so we can pick this new CPU:
 				 */
 				min_runnable_load = runnable_load;
 				min_avg_load = avg_load;
@@ -5928,7 +5927,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 				   (100*min_avg_load > imbalance_scale*avg_load)) {
 				/*
 				 * The runnable loads are close so take the
-				 * blocked load into account through avg_load.
+				 * blocked load into account through avg_load:
 				 */
 				min_avg_load = avg_load;
 				idlest = group;
@@ -5989,7 +5988,7 @@ skip_spare:
 }
 
 /*
- * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
  */
 static int
 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -6067,12 +6066,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 
 		new_cpu = find_idlest_group_cpu(group, p, cpu);
 		if (new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
+			/* Now try balancing at a lower domain level of 'cpu': */
 			sd = sd->child;
 			continue;
 		}
 
-		/* Now try balancing at a lower domain level of new_cpu */
+		/* Now try balancing at a lower domain level of 'new_cpu': */
 		cpu = new_cpu;
 		weight = sd->span_weight;
 		sd = NULL;
@@ -6082,7 +6081,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 			if (tmp->flags & sd_flag)
 				sd = tmp;
 		}
-		/* while loop will break here if sd == NULL */
 	}
 
 	return new_cpu;
@@ -6278,12 +6276,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		return target;
 
 	/*
-	 * If the previous cpu is cache affine and idle, don't be stupid.
+	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
 		return prev;
 
-	/* Check a recently used CPU as a potential idle candidate */
+	/* Check a recently used CPU as a potential idle candidate: */
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
@@ -6292,7 +6290,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
-		 * candidate for the next wake.
+		 * candidate for the next wake:
 		 */
 		p->recent_used_cpu = prev;
 		return recent_used_cpu;
@@ -6357,7 +6355,7 @@ static inline unsigned long task_util(struct task_struct *p)
 }
 
 /*
- * cpu_util_wake: Compute cpu utilization with any contributions from
+ * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
  */
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
@@ -6403,10 +6401,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
  *
- * Balances load by selecting the idlest cpu in the idlest group, or under
- * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
+ * Balances load by selecting the idlest CPU in the idlest group, or under
+ * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
  *
- * Returns the target cpu number.
+ * Returns the target CPU number.
  *
  * preempt must be disabled.
  */
@@ -6431,7 +6429,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			break;
 
 		/*
-		 * If both cpu and prev_cpu are part of this domain,
+		 * If both 'cpu' and 'prev_cpu' are part of this domain,
 		 * cpu is a valid SD_WAKE_AFFINE target.
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6482,9 +6480,9 @@ pick_cpu:
 static void detach_entity_cfs_rq(struct sched_entity *se);
 
 /*
- * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
+ * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
  */
 static void migrate_task_rq_fair(struct task_struct *p)
 {
@@ -6918,17 +6916,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * BASICS
  *
  * The purpose of load-balancing is to achieve the same basic fairness the
- * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * per-CPU scheduler provides, namely provide a proportional amount of compute
  * time to each task. This is expressed in the following equation:
  *
  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
  *
- * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
  * W_i,0 is defined as:
  *
  *   W_i,0 = \Sum_j w_i,j                                             (2)
  *
- * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
  * is derived from the nice value as per sched_prio_to_weight[].
  *
  * The weight average is an exponential decay average of the instantaneous
@@ -6936,7 +6934,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *
  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
  *
- * C_i is the compute capacity of cpu i, typically it is the
+ * C_i is the compute capacity of CPU i, typically it is the
  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
  * can also include other factors [XXX].
  *
@@ -6957,11 +6955,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * SCHED DOMAINS
  *
  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
- * for all i,j solution, we create a tree of cpus that follows the hardware
+ * for all i,j solution, we create a tree of CPUs that follows the hardware
  * topology where each level pairs two lower groups (or better). This results
- * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
  * tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of cpus in
+ * of load-balance at each level inv. proportional to the number of CPUs in
  * the groups.
  *
  * This yields:
@@ -6970,7 +6968,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
  *     i = 0      2^i   2^i
  *                               `- size of each group
- *         |         |     `- number of cpus doing load-balance
+ *         |         |     `- number of CPUs doing load-balance
  *         |         `- freq
  *         `- sum over all levels
  *
@@ -6978,7 +6976,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * this makes (5) the runtime complexity of the balancer.
  *
  * An important property here is that each CPU is still (indirectly) connected
- * to every other cpu in at most O(log n) steps:
+ * to every other CPU in at most O(log n) steps:
  *
  * The adjacency matrix of the resulting graph is given by:
  *
@@ -6990,7 +6988,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *
  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
  *
- * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * Showing there's indeed a path between every CPU in at most O(log n) steps.
  * The task movement gives a factor of O(m), giving a convergence complexity
  * of:
  *
@@ -7000,7 +6998,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * WORK CONSERVING
  *
  * In order to avoid CPUs going idle while there's still work to do, new idle
- * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * balancing is more aggressive and has the newly idle CPU iterate up the domain
  * tree itself instead of relying on other CPUs to bring it work.
  *
  * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -7021,7 +7019,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *
  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
  *
- * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
  *
  * The big problem is S_k, its a global sum needed to compute a local (W_i)
  * property.
@@ -7185,7 +7183,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		env->flags |= LBF_SOME_PINNED;
 
 		/*
-		 * Remember if this task can be migrated to any other cpu in
+		 * Remember if this task can be migrated to any other CPU in
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
@@ -7195,7 +7193,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 			return 0;
 
-		/* Prevent to re-select dst_cpu via env's cpus */
+		/* Prevent to re-select dst_cpu via env's CPUs: */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
 			if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
 				env->flags |= LBF_DST_PINNED;
@@ -7769,8 +7767,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
  * Group imbalance indicates (and tries to solve) the problem where balancing
  * groups is inadequate due to ->cpus_allowed constraints.
  *
- * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
- * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
+ * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
  * Something like:
  *
  *	{ 0 1 2 3 } { 4 5 6 7 }
@@ -7778,7 +7776,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
  *
  * If we were to balance group-wise we'd place two tasks in the first group and
  * two tasks in the second group. Clearly this is undesired as it will overload
- * cpu 3 and leave one of the cpus in the second group unused.
+ * cpu 3 and leave one of the CPUs in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
  * by noticing the lower domain failed to reach balance and had difficulty
@@ -7891,7 +7889,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		/* Bias balancing toward cpus of our domain */
+		/* Bias balancing toward CPUs of our domain: */
 		if (local_group)
 			load = target_load(i, load_idx);
 		else
@@ -7977,7 +7975,7 @@ asym_packing:
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
 
-	/* No ASYM_PACKING if target cpu is already busy */
+	/* No ASYM_PACKING if target CPU is already busy */
 	if (env->idle == CPU_NOT_IDLE)
 		return true;
 	/*
@@ -7990,7 +7988,7 @@ asym_packing:
 		if (!sds->busiest)
 			return true;
 
-		/* Prefer to move from lowest priority cpu's work */
+		/* Prefer to move from lowest priority CPU's work */
 		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
 				      sg->asym_prefer_cpu))
 			return true;
@@ -8243,7 +8241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
-		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 * to ensure CPU-load equilibrium, look at wider averages. XXX
 		 */
 		busiest->load_per_task =
 			min(busiest->load_per_task, sds->avg_load);
@@ -8262,7 +8260,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	}
 
 	/*
-	 * If there aren't any idle cpus, avoid creating some.
+	 * If there aren't any idle CPUs, avoid creating some.
 	 */
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
@@ -8276,9 +8274,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	}
 
 	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
+	 * We're trying to get all the CPUs to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load. At the same time,
+	 * reduce the max loaded CPU below the average load. At the same time,
 	 * we also don't want to reduce the group load below the group
 	 * capacity. Thus we look for the minimum possible imbalance.
 	 */
@@ -8372,9 +8370,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 	if (env->idle == CPU_IDLE) {
 		/*
-		 * This cpu is idle. If the busiest group is not overloaded
+		 * This CPU is idle. If the busiest group is not overloaded
 		 * and there is no imbalance between this and busiest group
-		 * wrt idle cpus, it is balanced. The imbalance becomes
+		 * wrt idle CPUs, it is balanced. The imbalance becomes
 		 * significant if the diff is greater than 1 otherwise we
 		 * might end up to just move the imbalance on another group
 		 */
@@ -8402,7 +8400,7 @@ out_balanced:
 }
 
 /*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
  */
 static struct rq *find_busiest_queue(struct lb_env *env,
 				     struct sched_group *group)
@@ -8446,7 +8444,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * which is not scaled with the CPU capacity.
 		 */
 
 		if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8454,9 +8452,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 			continue;
 
 		/*
-		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
-		 * that the load can be moved away from the cpu that is
+		 * For the load comparisons with the other CPU's, consider
+		 * the weighted_cpuload() scaled with the CPU capacity, so
+		 * that the load can be moved away from the CPU that is
 		 * potentially running at a lower capacity.
 		 *
 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8527,13 +8525,13 @@ static int should_we_balance(struct lb_env *env)
 		return 0;
 
 	/*
-	 * In the newly idle case, we will allow all the cpu's
+	 * In the newly idle case, we will allow all the CPUs
 	 * to do the newly idle load balance.
 	 */
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 
-	/* Try to find first idle cpu */
+	/* Try to find first idle CPU */
 	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
 		if (!idle_cpu(cpu))
 			continue;
@@ -8546,7 +8544,7 @@ static int should_we_balance(struct lb_env *env)
 		balance_cpu = group_balance_cpu(sg);
 
 	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * First idle CPU or the first CPU(busiest) in this sched group
 	 * is eligible for doing load balancing at this and above domains.
 	 */
 	return balance_cpu == env->dst_cpu;
@@ -8655,7 +8653,7 @@ more_balance:
 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 		 * us and move them to an alternate dst_cpu in our sched_group
 		 * where they can run. The upper limit on how many times we
-		 * iterate on same src_cpu is dependent on number of cpus in our
+		 * iterate on same src_cpu is dependent on number of CPUs in our
 		 * sched_group.
 		 *
 		 * This changes load balance semantics a bit on who can move
@@ -8672,7 +8670,7 @@ more_balance:
 		 */
 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 
-			/* Prevent to re-select dst_cpu via env's cpus */
+			/* Prevent to re-select dst_cpu via env's CPUs */
 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
@@ -8734,9 +8732,10 @@ more_balance:
 
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
-			/* don't kick the active_load_balance_cpu_stop,
-			 * if the curr task on busiest cpu can't be
-			 * moved to this_cpu
+			/*
+			 * Don't kick the active_load_balance_cpu_stop,
+			 * if the curr task on busiest CPU can't be
+			 * moved to this_cpu:
 			 */
 			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8962,7 +8961,7 @@ out:
 }
 
 /*
- * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
  * running tasks off the busiest CPU onto idle CPUs. It requires at
  * least 1 task to be running on each physical CPU where possible, and
  * avoids physical / logical imbalances.
@@ -8986,7 +8985,7 @@ static int active_load_balance_cpu_stop(void *data)
 	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
 		goto out_unlock;
 
-	/* make sure the requested cpu hasn't gone down in the meantime */
+	/* Make sure the requested CPU hasn't gone down in the meantime: */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
 		goto out_unlock;
@@ -8998,7 +8997,7 @@ static int active_load_balance_cpu_stop(void *data)
 	/*
 	 * This condition is "impossible", if it occurs
 	 * we need to fix it. Originally reported by
-	 * Bjorn Helgaas on a 128-cpu setup.
+	 * Bjorn Helgaas on a 128-CPU setup.
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
@@ -9100,7 +9099,7 @@ static void nohz_balancer_kick(void)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
-	 * This way we generate a sched IPI on the target cpu which
+	 * This way we generate a sched IPI on the target CPU which
 	 * is idle. And the softirq performing nohz idle load balance
 	 * will be run before returning from the IPI.
 	 */
@@ -9157,14 +9156,12 @@ unlock:
 }
 
 /*
- * This routine will record that the cpu is going idle with tick stopped.
+ * This routine will record that the CPU is going idle with tick stopped.
  * This info will be used in performing idle load balancing in the future.
  */
 void nohz_balance_enter_idle(int cpu)
 {
-	/*
-	 * If this cpu is going down, then nothing needs to be done.
-	 */
+	/* If this CPU is going down, then nothing needs to be done: */
 	if (!cpu_active(cpu))
 		return;
 
@@ -9175,9 +9172,7 @@ void nohz_balance_enter_idle(int cpu)
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
 		return;
 
-	/*
-	 * If we're a completely isolated CPU, we don't play.
-	 */
+	/* If we're a completely isolated CPU, we don't play: */
 	if (on_null_domain(cpu_rq(cpu)))
 		return;
 
@@ -9286,7 +9281,7 @@ out:
 
 	/*
 	 * next_balance will be updated only when there is a need.
-	 * When the cpu is attached to null domain for ex, it will not be
+	 * When the CPU is attached to null domain for ex, it will not be
 	 * updated.
 	 */
 	if (likely(update_next_balance)) {
@@ -9310,7 +9305,7 @@ out:
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
  */
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
@@ -9330,8 +9325,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			continue;
 
 		/*
-		 * If this cpu gets work to do, stop the load balancing
-		 * work being done for other cpus. Next load
+		 * If this CPU gets work to do, stop the load balancing
+		 * work being done for other CPUs. Next load
 		 * balancing owner will pick it up.
 		 */
 		if (need_resched())
@@ -9373,13 +9368,13 @@ end:
 
 /*
  * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
+ * of an idle CPU in the system.
  *   - This rq has more than one task.
  *   - This rq has at least one CFS task and the capacity of the CPU is
  *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
- *     multiple busy cpu.
- *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *   - At parent of LLC scheduler domain level, this CPU's scheduler group has
+ *     multiple busy CPUs.
+ *   - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
  *     domain span are idle.
  */
 static inline bool nohz_kick_needed(struct rq *rq)
@@ -9469,10 +9464,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 						CPU_IDLE : CPU_NOT_IDLE;
 
 	/*
-	 * If this cpu has a pending nohz_balance_kick, then do the
-	 * balancing on behalf of the other idle cpus whose ticks are
+	 * If this CPU has a pending nohz_balance_kick, then do the
+	 * balancing on behalf of the other idle CPUs whose ticks are
 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
-	 * give the idle cpus a chance to load balance. Else we may
+	 * give the idle CPUs a chance to load balance. Else we may
 	 * load balance only within the local sched_domain hierarchy
 	 * and abort nohz_idle_balance altogether if we pull some load.
 	 */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..343d25f85477 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,5 +1,5 @@
 /*
- * Generic entry point for the idle threads
+ * Generic entry points for the idle threads
  */
 #include <linux/sched.h>
 #include <linux/sched/idle.h>
@@ -332,8 +332,8 @@ void cpu_startup_entry(enum cpuhp_state state)
 {
 	/*
 	 * This #ifdef needs to die, but it's too late in the cycle to
-	 * make this generic (arm and sh have never invoked the canary
-	 * init for the non boot cpus!). Will be fixed in 3.11
+	 * make this generic (ARM and SH have never invoked the canary
+	 * init for the non boot CPUs!). Will be fixed in 3.11
 	 */
 #ifdef CONFIG_X86
 	/*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 48b8a83f5185..ec73680922f8 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -14,7 +14,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
-#endif /* CONFIG_SMP */
+#endif
 
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -30,6 +30,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	put_prev_task(rq, prev);
 	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
+
 	return rq->idle;
 }
 
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 39f340dde1d7..aad5f48a07c6 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -6,13 +6,13 @@
  * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
  *
  */
-
 #include <linux/sched/isolation.h>
 #include <linux/tick.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+
 #include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a398e7e28a8a 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -32,29 +32,29 @@
  * Due to a number of reasons the above turns in the mess below:
  *
  *  - for_each_possible_cpu() is prohibitively expensive on machines with
- *    serious number of cpus, therefore we need to take a distributed approach
+ *    serious number of CPUs, therefore we need to take a distributed approach
  *    to calculating nr_active.
  *
  *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
  *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
  *
  *    So assuming nr_active := 0 when we start out -- true per definition, we
- *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    can simply take per-CPU deltas and fold those into a global accumulate
  *    to obtain the same result. See calc_load_fold_active().
  *
- *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
  *    across the machine, we assume 10 ticks is sufficient time for every
- *    cpu to have completed this task.
+ *    CPU to have completed this task.
  *
  *    This places an upper-bound on the IRQ-off latency of the machine. Then
  *    again, being late doesn't loose the delta, just wrecks the sample.
  *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *    this would add another cross-cpu cacheline miss and atomic operation
- *    to the wakeup path. Instead we increment on whatever cpu the task ran
- *    when it went into uninterruptible state and decrement on whatever cpu
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
+ *    this would add another cross-CPU cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever CPU the task ran
+ *    when it went into uninterruptible state and decrement on whatever CPU
  *    did the wakeup. This means that only the sum of nr_uninterruptible over
- *    all cpus yields the correct result.
+ *    all CPUs yields the correct result.
  *
  *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
  */
@@ -115,11 +115,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
  * Handle NO_HZ for the global load-average.
  *
  * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
+ * load-average relies on per-CPU sampling from the tick, it is affected by
  * NO_HZ.
  *
  * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * entering NO_HZ state such that we can include this as an 'extra' CPU delta
  * when we read the global state.
  *
  * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +146,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
  *    busy state.
  *
  *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
+ *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
  *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
+ *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
  *    intervals.
  *
  * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +299,7 @@ calc_load_n(unsigned long load, unsigned long exp,
 }
 
 /*
- * NO_HZ can leave us missing all per-cpu ticks calling
+ * NO_HZ can leave us missing all per-CPU ticks calling
  * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
  * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
  * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +363,7 @@ void calc_global_load(unsigned long ticks)
 		return;
 
 	/*
-	 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
+	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
 	 */
 	delta = calc_load_nohz_fold();
 	if (delta)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..2c6ae2413fa2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -27,18 +27,18 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
-#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	\
-	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
 #else
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
 #endif
 
-#define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
-	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
-	| MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
-	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	\
+#define MEMBARRIER_CMD_BITMASK						\
+	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
+	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
 	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
 
 static void ipi_mb(void *info)
@@ -85,6 +85,7 @@ static int membarrier_global_expedited(void)
 		 */
 		if (cpu == raw_smp_processor_id())
 			continue;
+
 		rcu_read_lock();
 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
 		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +189,7 @@ static int membarrier_private_expedited(int flags)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+
 	return 0;
 }
 
@@ -219,6 +221,7 @@ static int membarrier_register_global_expedited(void)
 	}
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
+
 	return 0;
 }
 
@@ -253,6 +256,7 @@ static int membarrier_register_private_expedited(int flags)
 		synchronize_sched();
 	}
 	atomic_or(state, &mm->membarrier_state);
+
 	return 0;
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c80563b4f6b9..e40498872111 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1453,9 +1453,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 		return;
 
 	/*
-	 * There appears to be other cpus that can accept
-	 * current and none to run 'p', so lets reschedule
-	 * to try and push current away:
+	 * There appear to be other CPUs that can accept
+	 * the current task but none can run 'p', so lets reschedule
+	 * to try and push the current task away:
 	 */
 	requeue_task_rt(rq, p, 1);
 	resched_curr(rq);
@@ -1596,12 +1596,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 	if (!task_running(rq, p) &&
 	    cpumask_test_cpu(cpu, &p->cpus_allowed))
 		return 1;
+
 	return 0;
 }
 
 /*
  * Return the highest pushable rq's task, which is suitable to be executed
- * on the cpu, NULL otherwise
+ * on the CPU, NULL otherwise
  */
 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 {
@@ -1639,11 +1640,11 @@ static int find_lowest_rq(struct task_struct *task)
 		return -1; /* No targets found */
 
 	/*
-	 * At this point we have built a mask of cpus representing the
+	 * At this point we have built a mask of CPUs representing the
 	 * lowest priority tasks in the system.  Now we want to elect
 	 * the best one based on our affinity and topology.
 	 *
-	 * We prioritize the last cpu that the task executed on since
+	 * We prioritize the last CPU that the task executed on since
 	 * it is most likely cache-hot in that location.
 	 */
 	if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1652,7 @@ static int find_lowest_rq(struct task_struct *task)
 
 	/*
 	 * Otherwise, we consult the sched_domains span maps to figure
-	 * out which cpu is logically closest to our hot cache data.
+	 * out which CPU is logically closest to our hot cache data.
 	 */
 	if (!cpumask_test_cpu(this_cpu, lowest_mask))
 		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1693,7 @@ static int find_lowest_rq(struct task_struct *task)
 	cpu = cpumask_any(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
+
 	return -1;
 }
 
@@ -1827,7 +1829,7 @@ retry:
 			 * The task hasn't migrated, and is still the next
 			 * eligible task, but we failed to find a run-queue
 			 * to push it to.  Do not retry in this case, since
-			 * other cpus will pull from us when ready.
+			 * other CPUs will pull from us when ready.
 			 */
 			goto out;
 		}
@@ -1919,7 +1921,7 @@ static int rto_next_cpu(struct root_domain *rd)
 	 * rt_next_cpu() will simply return the first CPU found in
 	 * the rto_mask.
 	 *
-	 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+	 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
 	 * will return the next CPU found in the rto_mask.
 	 *
 	 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1982,7 @@ static void tell_cpu_to_push(struct rq *rq)
 	raw_spin_lock(&rq->rd->rto_lock);
 
 	/*
-	 * The rto_cpu is updated under the lock, if it has a valid cpu
+	 * The rto_cpu is updated under the lock, if it has a valid CPU
 	 * then the IPI is still running and will continue due to the
 	 * update to loop_next, and nothing needs to be done here.
 	 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2107,7 @@ static void pull_rt_task(struct rq *this_rq)
 
 			/*
 			 * There's a chance that p is higher in priority
-			 * than what's currently running on its cpu.
+			 * than what's currently running on its CPU.
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
@@ -2693,6 +2695,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
 			msecs_to_jiffies(sysctl_sched_rr_timeslice);
 	}
 	mutex_unlock(&mutex);
+
 	return ret;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc6c8b5a24ad..bd1461ae06e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-
+/*
+ * Scheduler internal types and methods:
+ */
 #include <linux/sched.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/sysctl.h>
@@ -79,11 +81,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  * and does not change the user-interface for setting shares/weights.
  *
  * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
- * pretty high and the returns do not justify the increased costs.
+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+ * are pretty high and the returns do not justify the increased costs.
  *
- * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
- * increase coverage and consistency always enable it on 64bit platforms.
+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
+ * increase coverage and consistency always enable it on 64-bit platforms.
  */
 #ifdef CONFIG_64BIT
 # define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
@@ -111,16 +113,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  * 10 -> just above 1us
  * 9  -> just above 0.5us
  */
-#define DL_SCALE (10)
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- */
+#define DL_SCALE		10
 
 /*
- * single value that denotes runtime == period, ie unlimited time.
+ * Single value that denotes runtime == period, ie unlimited time.
  */
-#define RUNTIME_INF	((u64)~0ULL)
+#define RUNTIME_INF		((u64)~0ULL)
 
 static inline int idle_policy(int policy)
 {
@@ -235,9 +233,9 @@ void __dl_clear_params(struct task_struct *p);
  * control.
  */
 struct dl_bandwidth {
-	raw_spinlock_t dl_runtime_lock;
-	u64 dl_runtime;
-	u64 dl_period;
+	raw_spinlock_t		dl_runtime_lock;
+	u64			dl_runtime;
+	u64			dl_period;
 };
 
 static inline int dl_bandwidth_enabled(void)
@@ -246,8 +244,9 @@ static inline int dl_bandwidth_enabled(void)
 }
 
 struct dl_bw {
-	raw_spinlock_t lock;
-	u64 bw, total_bw;
+	raw_spinlock_t		lock;
+	u64			bw;
+	u64			total_bw;
 };
 
 static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +272,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }
 
-void dl_change_utilization(struct task_struct *p, u64 new_bw);
+extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
 extern void init_dl_bw(struct dl_bw *dl_b);
-extern int sched_dl_global_validate(void);
+extern int  sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);
-extern int sched_dl_overflow(struct task_struct *p, int policy,
-			     const struct sched_attr *attr);
+extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p,
-			      const struct cpumask *cs_cpus_allowed);
-extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
-					const struct cpumask *trial);
+extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
+extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern bool dl_cpu_busy(unsigned int cpu);
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -300,32 +296,36 @@ extern struct list_head task_groups;
 
 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
-	raw_spinlock_t lock;
-	ktime_t period;
-	u64 quota, runtime;
-	s64 hierarchical_quota;
-	u64 runtime_expires;
-
-	int idle, period_active;
-	struct hrtimer period_timer, slack_timer;
-	struct list_head throttled_cfs_rq;
-
-	/* statistics */
-	int nr_periods, nr_throttled;
-	u64 throttled_time;
+	raw_spinlock_t		lock;
+	ktime_t			period;
+	u64			quota;
+	u64			runtime;
+	s64			hierarchical_quota;
+	u64			runtime_expires;
+
+	int			idle;
+	int			period_active;
+	struct hrtimer		period_timer;
+	struct hrtimer		slack_timer;
+	struct list_head	throttled_cfs_rq;
+
+	/* Statistics: */
+	int			nr_periods;
+	int			nr_throttled;
+	u64			throttled_time;
 #endif
 };
 
-/* task group related information */
+/* Task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	/* schedulable entities of this group on each cpu */
-	struct sched_entity **se;
-	/* runqueue "owned" by this group on each cpu */
-	struct cfs_rq **cfs_rq;
-	unsigned long shares;
+	/* schedulable entities of this group on each CPU */
+	struct sched_entity	**se;
+	/* runqueue "owned" by this group on each CPU */
+	struct cfs_rq		**cfs_rq;
+	unsigned long		shares;
 
 #ifdef	CONFIG_SMP
 	/*
@@ -333,29 +333,29 @@ struct task_group {
 	 * it in its own cacheline separated from the fields above which
 	 * will also be accessed at each tick.
 	 */
-	atomic_long_t load_avg ____cacheline_aligned;
+	atomic_long_t		load_avg ____cacheline_aligned;
 #endif
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	struct sched_rt_entity **rt_se;
-	struct rt_rq **rt_rq;
+	struct sched_rt_entity	**rt_se;
+	struct rt_rq		**rt_rq;
 
-	struct rt_bandwidth rt_bandwidth;
+	struct rt_bandwidth	rt_bandwidth;
 #endif
 
-	struct rcu_head rcu;
-	struct list_head list;
+	struct rcu_head		rcu;
+	struct list_head	list;
 
-	struct task_group *parent;
-	struct list_head siblings;
-	struct list_head children;
+	struct task_group	*parent;
+	struct list_head	siblings;
+	struct list_head	children;
 
 #ifdef CONFIG_SCHED_AUTOGROUP
-	struct autogroup *autogroup;
+	struct autogroup	*autogroup;
 #endif
 
-	struct cfs_bandwidth cfs_bandwidth;
+	struct cfs_bandwidth	cfs_bandwidth;
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +369,8 @@ struct task_group {
  * (The default weight is 1024 - so there's no practical
  *  limitation from this.)
  */
-#define MIN_SHARES	(1UL <<  1)
-#define MAX_SHARES	(1UL << 18)
+#define MIN_SHARES		(1UL <<  1)
+#define MAX_SHARES		(1UL << 18)
 #endif
 
 typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +443,39 @@ struct cfs_bandwidth { };
 
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
-	struct load_weight load;
-	unsigned long runnable_weight;
-	unsigned int nr_running, h_nr_running;
+	struct load_weight	load;
+	unsigned long		runnable_weight;
+	unsigned int		nr_running;
+	unsigned int		h_nr_running;
 
-	u64 exec_clock;
-	u64 min_vruntime;
+	u64			exec_clock;
+	u64			min_vruntime;
 #ifndef CONFIG_64BIT
-	u64 min_vruntime_copy;
+	u64			min_vruntime_copy;
 #endif
 
-	struct rb_root_cached tasks_timeline;
+	struct rb_root_cached	tasks_timeline;
 
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last, *skip;
+	struct sched_entity	*curr;
+	struct sched_entity	*next;
+	struct sched_entity	*last;
+	struct sched_entity	*skip;
 
 #ifdef	CONFIG_SCHED_DEBUG
-	unsigned int nr_spread_over;
+	unsigned int		nr_spread_over;
 #endif
 
 #ifdef CONFIG_SMP
 	/*
 	 * CFS load tracking
 	 */
-	struct sched_avg avg;
+	struct sched_avg	avg;
 #ifndef CONFIG_64BIT
-	u64 load_last_update_time_copy;
+	u64			load_last_update_time_copy;
 #endif
 	struct {
 		raw_spinlock_t	lock ____cacheline_aligned;
@@ -482,9 +486,9 @@ struct cfs_rq {
 	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	unsigned long tg_load_avg_contrib;
-	long propagate;
-	long prop_runnable_sum;
+	unsigned long		tg_load_avg_contrib;
+	long			propagate;
+	long			prop_runnable_sum;
 
 	/*
 	 *   h_load = weight * f(tg)
@@ -492,36 +496,38 @@ struct cfs_rq {
 	 * Where f(tg) is the recursive weight fraction assigned to
 	 * this group.
 	 */
-	unsigned long h_load;
-	u64 last_h_load_update;
-	struct sched_entity *h_load_next;
+	unsigned long		h_load;
+	u64			last_h_load_update;
+	struct sched_entity	*h_load_next;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
 
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 	 * (like users, containers etc.)
 	 *
-	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
-	 * list is used during load balance.
+	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
+	 * This list is used during load balance.
 	 */
-	int on_list;
-	struct list_head leaf_cfs_rq_list;
-	struct task_group *tg;	/* group that "owns" this runqueue */
+	int			on_list;
+	struct list_head	leaf_cfs_rq_list;
+	struct task_group	*tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_CFS_BANDWIDTH
-	int runtime_enabled;
-	u64 runtime_expires;
-	s64 runtime_remaining;
-
-	u64 throttled_clock, throttled_clock_task;
-	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
-	struct list_head throttled_list;
+	int			runtime_enabled;
+	u64			runtime_expires;
+	s64			runtime_remaining;
+
+	u64			throttled_clock;
+	u64			throttled_clock_task;
+	u64			throttled_clock_task_time;
+	int			throttled;
+	int			throttle_count;
+	struct list_head	throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -538,45 +544,45 @@ static inline int rt_bandwidth_enabled(void)
 
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
-	struct rt_prio_array active;
-	unsigned int rt_nr_running;
-	unsigned int rr_nr_running;
+	struct rt_prio_array	active;
+	unsigned int		rt_nr_running;
+	unsigned int		rr_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	struct {
-		int curr; /* highest queued rt task prio */
+		int		curr; /* highest queued rt task prio */
 #ifdef CONFIG_SMP
-		int next; /* next highest */
+		int		next; /* next highest */
 #endif
 	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
-	unsigned long rt_nr_migratory;
-	unsigned long rt_nr_total;
-	int overloaded;
-	struct plist_head pushable_tasks;
+	unsigned long		rt_nr_migratory;
+	unsigned long		rt_nr_total;
+	int			overloaded;
+	struct plist_head	pushable_tasks;
 #endif /* CONFIG_SMP */
-	int rt_queued;
+	int			rt_queued;
 
-	int rt_throttled;
-	u64 rt_time;
-	u64 rt_runtime;
+	int			rt_throttled;
+	u64			rt_time;
+	u64			rt_runtime;
 	/* Nests inside the rq lock: */
-	raw_spinlock_t rt_runtime_lock;
+	raw_spinlock_t		rt_runtime_lock;
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	unsigned long rt_nr_boosted;
+	unsigned long		rt_nr_boosted;
 
-	struct rq *rq;
-	struct task_group *tg;
+	struct rq		*rq;
+	struct task_group	*tg;
 #endif
 };
 
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
 	/* runqueue is an rbtree, ordered by deadline */
-	struct rb_root_cached root;
+	struct rb_root_cached	root;
 
-	unsigned long dl_nr_running;
+	unsigned long		dl_nr_running;
 
 #ifdef CONFIG_SMP
 	/*
@@ -586,28 +592,28 @@ struct dl_rq {
 	 * should migrate somewhere else.
 	 */
 	struct {
-		u64 curr;
-		u64 next;
+		u64		curr;
+		u64		next;
 	} earliest_dl;
 
-	unsigned long dl_nr_migratory;
-	int overloaded;
+	unsigned long		dl_nr_migratory;
+	int			overloaded;
 
 	/*
 	 * Tasks on this rq that can be pushed away. They are kept in
 	 * an rb-tree, ordered by tasks' deadlines, with caching
 	 * of the leftmost (earliest deadline) element.
 	 */
-	struct rb_root_cached pushable_dl_tasks_root;
+	struct rb_root_cached	pushable_dl_tasks_root;
 #else
-	struct dl_bw dl_bw;
+	struct dl_bw		dl_bw;
 #endif
 	/*
 	 * "Active utilization" for this runqueue: increased when a
 	 * task wakes up (becomes TASK_RUNNING) and decreased when a
 	 * task blocks
 	 */
-	u64 running_bw;
+	u64			running_bw;
 
 	/*
 	 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +624,14 @@ struct dl_rq {
 	 * This is needed to compute the "inactive utilization" for the
 	 * runqueue (inactive utilization = this_bw - running_bw).
 	 */
-	u64 this_bw;
-	u64 extra_bw;
+	u64			this_bw;
+	u64			extra_bw;
 
 	/*
 	 * Inverse of the fraction of CPU utilization that can be reclaimed
 	 * by the GRUB algorithm.
 	 */
-	u64 bw_ratio;
+	u64			bw_ratio;
 };
 
 #ifdef CONFIG_SMP
@@ -638,51 +644,51 @@ static inline bool sched_asym_prefer(int a, int b)
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * fully partitioning the member CPUs from any other cpuset. Whenever a new
  * exclusive cpuset is created, we also create and attach a new root-domain
  * object.
  *
  */
 struct root_domain {
-	atomic_t refcount;
-	atomic_t rto_count;
-	struct rcu_head rcu;
-	cpumask_var_t span;
-	cpumask_var_t online;
+	atomic_t		refcount;
+	atomic_t		rto_count;
+	struct rcu_head		rcu;
+	cpumask_var_t		span;
+	cpumask_var_t		online;
 
 	/* Indicate more than one runnable task for any CPU */
-	bool overload;
+	bool			overload;
 
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
 	 */
-	cpumask_var_t dlo_mask;
-	atomic_t dlo_count;
-	struct dl_bw dl_bw;
-	struct cpudl cpudl;
+	cpumask_var_t		dlo_mask;
+	atomic_t		dlo_count;
+	struct dl_bw		dl_bw;
+	struct cpudl		cpudl;
 
 #ifdef HAVE_RT_PUSH_IPI
 	/*
 	 * For IPI pull requests, loop across the rto_mask.
 	 */
-	struct irq_work rto_push_work;
-	raw_spinlock_t rto_lock;
+	struct irq_work		rto_push_work;
+	raw_spinlock_t		rto_lock;
 	/* These are only updated and read within rto_lock */
-	int rto_loop;
-	int rto_cpu;
+	int			rto_loop;
+	int			rto_cpu;
 	/* These atomics are updated outside of a lock */
-	atomic_t rto_loop_next;
-	atomic_t rto_loop_start;
+	atomic_t		rto_loop_next;
+	atomic_t		rto_loop_start;
 #endif
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
 	 * one runnable RT task.
 	 */
-	cpumask_var_t rto_mask;
-	struct cpupri cpupri;
+	cpumask_var_t		rto_mask;
+	struct cpupri		cpupri;
 
-	unsigned long max_cpu_capacity;
+	unsigned long		max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;
@@ -708,39 +714,39 @@ extern void rto_push_irq_work_func(struct irq_work *work);
  */
 struct rq {
 	/* runqueue lock: */
-	raw_spinlock_t lock;
+	raw_spinlock_t		lock;
 
 	/*
 	 * nr_running and cpu_load should be in the same cacheline because
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
-	unsigned int nr_running;
+	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
-	unsigned int nr_numa_running;
-	unsigned int nr_preferred_running;
+	unsigned int		nr_numa_running;
+	unsigned int		nr_preferred_running;
 #endif
 	#define CPU_LOAD_IDX_MAX 5
-	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+	unsigned long		cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ_COMMON
 #ifdef CONFIG_SMP
-	unsigned long last_load_update_tick;
+	unsigned long		last_load_update_tick;
 #endif /* CONFIG_SMP */
-	unsigned long nohz_flags;
+	unsigned long		nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
-	/* capture load from *all* tasks on this cpu: */
-	struct load_weight load;
-	unsigned long nr_load_updates;
-	u64 nr_switches;
+	/* capture load from *all* tasks on this CPU: */
+	struct load_weight	load;
+	unsigned long		nr_load_updates;
+	u64			nr_switches;
 
-	struct cfs_rq cfs;
-	struct rt_rq rt;
-	struct dl_rq dl;
+	struct cfs_rq		cfs;
+	struct rt_rq		rt;
+	struct dl_rq		dl;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	/* list of leaf cfs_rq on this cpu: */
-	struct list_head leaf_cfs_rq_list;
-	struct list_head *tmp_alone_branch;
+	/* list of leaf cfs_rq on this CPU: */
+	struct list_head	leaf_cfs_rq_list;
+	struct list_head	*tmp_alone_branch;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
@@ -749,94 +755,98 @@ struct rq {
 	 * one CPU and if it got migrated afterwards it may decrease
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
-	unsigned long nr_uninterruptible;
+	unsigned long		nr_uninterruptible;
 
-	struct task_struct *curr, *idle, *stop;
-	unsigned long next_balance;
-	struct mm_struct *prev_mm;
+	struct task_struct	*curr;
+	struct task_struct	*idle;
+	struct task_struct	*stop;
+	unsigned long		next_balance;
+	struct mm_struct	*prev_mm;
 
-	unsigned int clock_update_flags;
-	u64 clock;
-	u64 clock_task;
+	unsigned int		clock_update_flags;
+	u64			clock;
+	u64			clock_task;
 
-	atomic_t nr_iowait;
+	atomic_t		nr_iowait;
 
 #ifdef CONFIG_SMP
-	struct root_domain *rd;
-	struct sched_domain *sd;
+	struct root_domain	*rd;
+	struct sched_domain	*sd;
+
+	unsigned long		cpu_capacity;
+	unsigned long		cpu_capacity_orig;
 
-	unsigned long cpu_capacity;
-	unsigned long cpu_capacity_orig;
+	struct callback_head	*balance_callback;
 
-	struct callback_head *balance_callback;
+	unsigned char		idle_balance;
 
-	unsigned char idle_balance;
 	/* For active balancing */
-	int active_balance;
-	int push_cpu;
-	struct cpu_stop_work active_balance_work;
-	/* cpu of this runqueue: */
-	int cpu;
-	int online;
+	int			active_balance;
+	int			push_cpu;
+	struct cpu_stop_work	active_balance_work;
+
+	/* CPU of this runqueue: */
+	int			cpu;
+	int			online;
 
 	struct list_head cfs_tasks;
 
-	u64 rt_avg;
-	u64 age_stamp;
-	u64 idle_stamp;
-	u64 avg_idle;
+	u64			rt_avg;
+	u64			age_stamp;
+	u64			idle_stamp;
+	u64			avg_idle;
 
 	/* This is used to determine avg_idle's max value */
-	u64 max_idle_balance_cost;
+	u64			max_idle_balance_cost;
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	u64 prev_irq_time;
+	u64			prev_irq_time;
 #endif
 #ifdef CONFIG_PARAVIRT
-	u64 prev_steal_time;
+	u64			prev_steal_time;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	u64 prev_steal_time_rq;
+	u64			prev_steal_time_rq;
 #endif
 
 	/* calc_load related fields */
-	unsigned long calc_load_update;
-	long calc_load_active;
+	unsigned long		calc_load_update;
+	long			calc_load_active;
 
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
-	int hrtick_csd_pending;
-	call_single_data_t hrtick_csd;
+	int			hrtick_csd_pending;
+	call_single_data_t	hrtick_csd;
 #endif
-	struct hrtimer hrtick_timer;
+	struct hrtimer		hrtick_timer;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
-	struct sched_info rq_sched_info;
-	unsigned long long rq_cpu_time;
+	struct sched_info	rq_sched_info;
+	unsigned long long	rq_cpu_time;
 	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
-	unsigned int yld_count;
+	unsigned int		yld_count;
 
 	/* schedule() stats */
-	unsigned int sched_count;
-	unsigned int sched_goidle;
+	unsigned int		sched_count;
+	unsigned int		sched_goidle;
 
 	/* try_to_wake_up() stats */
-	unsigned int ttwu_count;
-	unsigned int ttwu_local;
+	unsigned int		ttwu_count;
+	unsigned int		ttwu_local;
 #endif
 
 #ifdef CONFIG_SMP
-	struct llist_head wake_list;
+	struct llist_head	wake_list;
 #endif
 
 #ifdef CONFIG_CPU_IDLE
 	/* Must be inspected within a rcu lock section */
-	struct cpuidle_state *idle_state;
+	struct cpuidle_state	*idle_state;
 #endif
 };
 
@@ -902,9 +912,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
  * one position though, because the next rq_unpin_lock() will shift it
  * back.
  */
-#define RQCF_REQ_SKIP	0x01
-#define RQCF_ACT_SKIP	0x02
-#define RQCF_UPDATED	0x04
+#define RQCF_REQ_SKIP		0x01
+#define RQCF_ACT_SKIP		0x02
+#define RQCF_UPDATED		0x04
 
 static inline void assert_clock_updated(struct rq *rq)
 {
@@ -1057,12 +1067,12 @@ extern void sched_ttwu_pending(void);
 
 /**
  * highest_flag_domain - Return highest sched_domain containing flag.
- * @cpu:	The cpu whose highest level of sched domain is to
+ * @cpu:	The CPU whose highest level of sched domain is to
  *		be returned.
  * @flag:	The flag to check for the highest sched_domain
- *		for the given cpu.
+ *		for the given CPU.
  *
- * Returns the highest sched_domain of a cpu which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains the given flag.
  */
 static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 {
@@ -1097,30 +1107,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
 
 struct sched_group_capacity {
-	atomic_t ref;
+	atomic_t		ref;
 	/*
 	 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
 	 * for a single CPU.
 	 */
-	unsigned long capacity;
-	unsigned long min_capacity; /* Min per-CPU capacity in group */
-	unsigned long next_update;
-	int imbalance; /* XXX unrelated to capacity but shared group state */
+	unsigned long		capacity;
+	unsigned long		min_capacity;		/* Min per-CPU capacity in group */
+	unsigned long		next_update;
+	int			imbalance;		/* XXX unrelated to capacity but shared group state */
 
 #ifdef CONFIG_SCHED_DEBUG
-	int id;
+	int			id;
 #endif
 
-	unsigned long cpumask[0]; /* balance mask */
+	unsigned long		cpumask[0];		/* Balance mask */
 };
 
 struct sched_group {
-	struct sched_group *next;	/* Must be a circular list */
-	atomic_t ref;
+	struct sched_group	*next;			/* Must be a circular list */
+	atomic_t		ref;
 
-	unsigned int group_weight;
+	unsigned int		group_weight;
 	struct sched_group_capacity *sgc;
-	int asym_prefer_cpu;		/* cpu of highest priority in group */
+	int			asym_prefer_cpu;	/* CPU of highest priority in group */
 
 	/*
 	 * The CPUs this group covers.
@@ -1129,7 +1139,7 @@ struct sched_group {
 	 * by attaching extra space to the end of the structure,
 	 * depending on how many CPUs the kernel has booted up with)
 	 */
-	unsigned long cpumask[0];
+	unsigned long		cpumask[0];
 };
 
 static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1146,8 +1156,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
 }
 
 /**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
+ * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
+ * @group: The group whose first CPU is to be returned.
  */
 static inline unsigned int group_first_cpu(struct sched_group *group)
 {
@@ -1357,9 +1367,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 /*
  * wake flags
  */
-#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
-#define WF_FORK		0x02		/* child wakeup after fork */
-#define WF_MIGRATED	0x4		/* internal use, task got migrated */
+#define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */
+#define WF_FORK			0x02		/* Child wakeup after fork */
+#define WF_MIGRATED		0x4		/* Internal use, task got migrated */
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1370,11 +1380,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO                3
-#define WMULT_IDLEPRIO         1431655765
+#define WEIGHT_IDLEPRIO		3
+#define WMULT_IDLEPRIO		1431655765
 
-extern const int sched_prio_to_weight[40];
-extern const u32 sched_prio_to_wmult[40];
+extern const int		sched_prio_to_weight[40];
+extern const u32		sched_prio_to_wmult[40];
 
 /*
  * {de,en}queue flags:
@@ -1396,9 +1406,9 @@ extern const u32 sched_prio_to_wmult[40];
  */
 
 #define DEQUEUE_SLEEP		0x01
-#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK		0x08 /* matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -1420,10 +1430,10 @@ struct sched_class {
 
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
-	void (*yield_task) (struct rq *rq);
-	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+	void (*yield_task)   (struct rq *rq);
+	bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
 
-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
 
 	/*
 	 * It is the responsibility of the pick_next_task() method that will
@@ -1433,16 +1443,16 @@ struct sched_class {
 	 * May return RETRY_TASK when it finds a higher prio class has runnable
 	 * tasks.
 	 */
-	struct task_struct * (*pick_next_task) (struct rq *rq,
-						struct task_struct *prev,
-						struct rq_flags *rf);
-	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+	struct task_struct * (*pick_next_task)(struct rq *rq,
+					       struct task_struct *prev,
+					       struct rq_flags *rf);
+	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p);
 
-	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const struct cpumask *newmask);
@@ -1451,31 +1461,31 @@ struct sched_class {
 	void (*rq_offline)(struct rq *rq);
 #endif
 
-	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_fork) (struct task_struct *p);
-	void (*task_dead) (struct task_struct *p);
+	void (*set_curr_task)(struct rq *rq);
+	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+	void (*task_fork)(struct task_struct *p);
+	void (*task_dead)(struct task_struct *p);
 
 	/*
 	 * The switched_from() call is allowed to drop rq->lock, therefore we
 	 * cannot assume the switched_from/switched_to pair is serliazed by
 	 * rq->lock. They are however serialized by p->pi_lock.
 	 */
-	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
-	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
+	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
-			     int oldprio);
+			      int oldprio);
 
-	unsigned int (*get_rr_interval) (struct rq *rq,
-					 struct task_struct *task);
+	unsigned int (*get_rr_interval)(struct rq *rq,
+					struct task_struct *task);
 
-	void (*update_curr) (struct rq *rq);
+	void (*update_curr)(struct rq *rq);
 
-#define TASK_SET_GROUP  0
-#define TASK_MOVE_GROUP	1
+#define TASK_SET_GROUP		0
+#define TASK_MOVE_GROUP		1
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*task_change_group) (struct task_struct *p, int type);
+	void (*task_change_group)(struct task_struct *p, int type);
 #endif
 };
 
@@ -1524,6 +1534,7 @@ static inline void idle_set_state(struct rq *rq,
 static inline struct cpuidle_state *idle_get_state(struct rq *rq)
 {
 	SCHED_WARN_ON(!rcu_read_lock_held());
+
 	return rq->idle_state;
 }
 #else
@@ -1562,9 +1573,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
 
-#define BW_SHIFT	20
-#define BW_UNIT		(1 << BW_SHIFT)
-#define RATIO_SHIFT	8
+#define BW_SHIFT		20
+#define BW_UNIT			(1 << BW_SHIFT)
+#define RATIO_SHIFT		8
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1814,8 +1825,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 /*
  * Unfair double_lock_balance: Optimizes throughput at the expense of
  * latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry.  This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
+ * already in proper order on entry.  This favors lower CPU-ids and will
+ * grant the double lock to lower CPUs over higher ids under contention,
  * regardless of entry order into the function.
  */
 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1847,7 +1858,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 {
 	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
+		/* printk() doesn't work well under rq->lock */
 		raw_spin_unlock(&this_rq->lock);
 		BUG_ON(1);
 	}
@@ -2106,15 +2117,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
 #ifdef arch_scale_freq_capacity
-#ifndef arch_scale_freq_invariant
-#define arch_scale_freq_invariant()	(true)
-#endif
-#else /* arch_scale_freq_capacity */
-#define arch_scale_freq_invariant()	(false)
+# ifndef arch_scale_freq_invariant
+#  define arch_scale_freq_invariant()	true
+# endif
+#else
+# define arch_scale_freq_invariant()	false
 #endif
 
 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-
 static inline unsigned long cpu_util_dl(struct rq *rq)
 {
 	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2124,5 +2134,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
 {
 	return rq->cfs.avg.util_avg;
 }
-
 #endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..968c1fe3099a 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -78,8 +78,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
  * This itererator needs some explanation.
  * It returns 1 for the header position.
  * This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
  */
 static void *schedstat_start(struct seq_file *file, loff_t *offset)
 {
@@ -99,12 +99,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
 
 	if (n < nr_cpu_ids)
 		return (void *)(unsigned long)(n + 2);
+
 	return NULL;
 }
 
 static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
 {
 	(*offset)++;
+
 	return schedstat_start(file, offset);
 }
 
@@ -134,6 +136,7 @@ static const struct file_operations proc_schedstat_operations = {
 static int __init proc_schedstat_init(void)
 {
 	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+
 	return 0;
 }
 subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 	if (rq)
 		rq->rq_sched_info.run_delay += delta;
 }
-#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+#define   schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
 #define __schedstat_inc(var)		do { var++; } while (0)
-#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
+#define   schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
 #define __schedstat_add(var, amt)	do { var += (amt); } while (0)
-#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
-#define __schedstat_set(var, val)		do { var = (val); } while (0)
-#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
-#define schedstat_val(var)		(var)
-#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
-
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-#define schedstat_enabled()		0
-#define __schedstat_inc(var)		do { } while (0)
-#define schedstat_inc(var)		do { } while (0)
-#define __schedstat_add(var, amt)	do { } while (0)
-#define schedstat_add(var, amt)		do { } while (0)
-#define __schedstat_set(var, val)	do { } while (0)
-#define schedstat_set(var, val)		do { } while (0)
-#define schedstat_val(var)		0
-#define schedstat_val_or_zero(var)	0
+#define   schedstat_add(var, amt)	do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val)	do { var = (val); } while (0)
+#define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define   schedstat_val(var)		(var)
+#define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+
+#else /* !CONFIG_SCHEDSTATS: */
+static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delta) { }
+# define   schedstat_enabled()		0
+# define __schedstat_inc(var)		do { } while (0)
+# define   schedstat_inc(var)		do { } while (0)
+# define __schedstat_add(var, amt)	do { } while (0)
+# define   schedstat_add(var, amt)	do { } while (0)
+# define __schedstat_set(var, val)	do { } while (0)
+# define   schedstat_set(var, val)	do { } while (0)
+# define   schedstat_val(var)		0
+# define   schedstat_val_or_zero(var)	0
 #endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_SCHED_INFO
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
 
 /*
  * We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
  */
 static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
 {
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
 }
 
 /*
- * Called when a task finally hits the cpu.  We can now calculate how
+ * Called when a task finally hits the CPU.  We can now calculate how
  * long it was waiting to run.  We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
  */
 static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
 {
-	if (unlikely(sched_info_on()))
+	if (unlikely(sched_info_on())) {
 		if (!t->sched_info.last_queued)
 			t->sched_info.last_queued = rq_clock(rq);
+	}
 }
 
 /*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
  */
 static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
 {
-	unsigned long long delta = rq_clock(rq) -
-					t->sched_info.last_arrival;
+	unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
 
 	rq_sched_info_depart(rq, delta);
 
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-__sched_info_switch(struct rq *rq,
-		    struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	/*
-	 * prev now departs the cpu.  It's not interesting to record
+	 * prev now departs the CPU.  It's not interesting to record
 	 * stats about how efficient we were at scheduling the idle
 	 * process, however.
 	 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
 	if (next != rq->idle)
 		sched_info_arrive(rq, next);
 }
+
 static inline void
-sched_info_switch(struct rq *rq,
-		  struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	if (unlikely(sched_info_on()))
 		__sched_info_switch(rq, prev, next);
 }
-#else
-#define sched_info_queued(rq, t)		do { } while (0)
-#define sched_info_reset_dequeued(t)	do { } while (0)
-#define sched_info_dequeued(rq, t)		do { } while (0)
-#define sched_info_depart(rq, t)		do { } while (0)
-#define sched_info_arrive(rq, next)		do { } while (0)
-#define sched_info_switch(rq, t, next)		do { } while (0)
+
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_queued(rq, t)	do { } while (0)
+# define sched_info_reset_dequeued(t)	do { } while (0)
+# define sched_info_dequeued(rq, t)	do { } while (0)
+# define sched_info_depart(rq, t)	do { } while (0)
+# define sched_info_arrive(rq, next)	do { } while (0)
+# define sched_info_switch(rq, t, next)	do { } while (0)
 #endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index ea8d2b6a1239..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
 /*
  * stop-task scheduling class.
  *
@@ -9,6 +7,7 @@
  *
  * See kernel/stop_machine.c
  */
+#include "sched.h"
 
 #ifdef CONFIG_SMP
 static int
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b88ab4e0207f 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * <linux/swait.h> (simple wait queues ) implementation:
+ */
 #include <linux/sched/signal.h>
 #include <linux/swait.h>
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..219eee70e457 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -41,8 +41,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	if (!(sd->flags & SD_LOAD_BALANCE)) {
 		printk("does not load-balance\n");
 		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-					" has parent");
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
 		return -1;
 	}
 
@@ -50,12 +49,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-		printk(KERN_ERR "ERROR: domain->span does not contain "
-				"CPU%d\n", cpu);
+		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
 	}
 	if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
-		printk(KERN_ERR "ERROR: domain->groups does not contain"
-				" CPU%d\n", cpu);
+		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
 	}
 
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +112,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 	if (sd->parent &&
 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
-		printk(KERN_ERR "ERROR: parent span is not a superset "
-			"of domain->span\n");
+		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
 	return 0;
 }
 
@@ -595,7 +591,7 @@ int group_balance_cpu(struct sched_group *sg)
  * are not.
  *
  * This leads to a few particularly weird cases where the sched_domain's are
- * not of the same number for each cpu. Consider:
+ * not of the same number for each CPU. Consider:
  *
  * NUMA-2	0-3						0-3
  *  groups:	{0-2},{1-3}					{1-3},{0-2}
@@ -780,7 +776,7 @@ fail:
  *	    ^ ^             ^ ^
  *          `-'             `-'
  *
- * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * The sched_domains are per-CPU and have a two way link (parent & child) and
  * denote the ever growing mask of CPUs belonging to that level of topology.
  *
  * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1017,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
 		return sa_sd;
+
 	return sa_rootdomain;
 }
 
@@ -1047,12 +1044,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 }
 
 #ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
 enum numa_topology_type sched_numa_topology_type;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
+
+static int			sched_domains_numa_levels;
+static int			sched_domains_curr_level;
+
+int				sched_max_numa_distance;
+static int			*sched_domains_numa_distance;
+static struct cpumask		***sched_domains_numa_masks;
 #endif
 
 /*
@@ -1074,11 +1073,11 @@ static int sched_domains_curr_level;
  *   SD_ASYM_PACKING        - describes SMT quirks
  */
 #define TOPOLOGY_SD_FLAGS		\
-	(SD_SHARE_CPUCAPACITY |		\
+	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
-	 SD_NUMA |			\
-	 SD_ASYM_PACKING |		\
-	 SD_ASYM_CPUCAPACITY |		\
+	 SD_NUMA		|	\
+	 SD_ASYM_PACKING	|	\
+	 SD_ASYM_CPUCAPACITY	|	\
 	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
@@ -1628,7 +1627,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
 			pr_err("     the %s domain not a subset of the %s domain\n",
 					child->name, sd->name);
 #endif
-			/* Fixup, ensure @sd has at least @child cpus. */
+			/* Fixup, ensure @sd has at least @child CPUs. */
 			cpumask_or(sched_domain_span(sd),
 				   sched_domain_span(sd),
 				   sched_domain_span(child));
@@ -1720,6 +1719,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	ret = 0;
 error:
 	__free_domain_allocs(&d, alloc_state, cpu_map);
+
 	return ret;
 }
 
@@ -1824,6 +1824,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 		return 1;
 
 	tmp = SD_ATTR_INIT;
+
 	return !memcmp(cur ? (cur + idx_cur) : &tmp,
 			new ? (new + idx_new) : &tmp,
 			sizeof(struct sched_domain_attr));
@@ -1929,4 +1930,3 @@ match2:
 
 	mutex_unlock(&sched_domains_mutex);
 }
-
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..7b2a142ae629 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -107,6 +107,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 			break;
 		}
 	}
+
 	return nr_exclusive;
 }
 
@@ -317,6 +318,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 	spin_unlock(&wq->lock);
 	schedule();
 	spin_lock(&wq->lock);
+
 	return 0;
 }
 EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +335,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 	spin_unlock_irq(&wq->lock);
 	schedule();
 	spin_lock_irq(&wq->lock);
+
 	return 0;
 }
 EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +381,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 
 	if (ret)
 		list_del_init(&wq_entry->entry);
+
 	return ret;
 }
 EXPORT_SYMBOL(autoremove_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..5293c59163a6 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -29,8 +29,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 			wait_bit->key.bit_nr != key->bit_nr ||
 			test_bit(key->bit_nr, key->flags))
 		return 0;
-	else
-		return autoremove_wake_function(wq_entry, mode, sync, key);
+
+	return autoremove_wake_function(wq_entry, mode, sync, key);
 }
 EXPORT_SYMBOL(wake_bit_function);
 
@@ -50,7 +50,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
 		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
 			ret = (*action)(&wbq_entry->key, mode);
 	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+
 	finish_wait(wq_head, &wbq_entry->wq_entry);
+
 	return ret;
 }
 EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +75,7 @@ int __sched out_of_line_wait_on_bit_timeout(
 	DEFINE_WAIT_BIT(wq_entry, word, bit);
 
 	wq_entry.key.timeout = jiffies + timeout;
+
 	return __wait_on_bit(wq_head, &wq_entry, action, mode);
 }
 EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +123,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
 void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
 {
 	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+
 	if (waitqueue_active(wq_head))
 		__wake_up(wq_head, TASK_NORMAL, 1, &key);
 }
@@ -157,6 +161,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
 {
 	if (BITS_PER_LONG == 64) {
 		unsigned long q = (unsigned long)p;
+
 		return bit_waitqueue((void *)(q & ~1), q & 1);
 	}
 	return bit_waitqueue(p, 0);
@@ -173,6 +178,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
 	    wait_bit->key.bit_nr != key->bit_nr ||
 	    atomic_read(val) != 0)
 		return 0;
+
 	return autoremove_wake_function(wq_entry, mode, sync, key);
 }
 
@@ -196,6 +202,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		ret = (*action)(val, mode);
 	} while (!ret && atomic_read(val) != 0);
 	finish_wait(wq_head, &wbq_entry->wq_entry);
+
 	return ret;
 }
 
@@ -226,6 +233,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
 	schedule();
 	if (signal_pending_state(mode, current))
 		return -EINTR;
+
 	return 0;
 }
 EXPORT_SYMBOL(atomic_t_wait);
@@ -250,6 +258,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
 	schedule();
 	if (signal_pending_state(mode, current))
 		return -EINTR;
+
 	return 0;
 }
 EXPORT_SYMBOL(bit_wait);
@@ -259,6 +268,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
 	io_schedule();
 	if (signal_pending_state(mode, current))
 		return -EINTR;
+
 	return 0;
 }
 EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +276,13 @@ EXPORT_SYMBOL(bit_wait_io);
 __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
 {
 	unsigned long now = READ_ONCE(jiffies);
+
 	if (time_after_eq(now, word->timeout))
 		return -EAGAIN;
 	schedule_timeout(word->timeout - now);
 	if (signal_pending_state(mode, current))
 		return -EINTR;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +290,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
 {
 	unsigned long now = READ_ONCE(jiffies);
+
 	if (time_after_eq(now, word->timeout))
 		return -EAGAIN;
 	io_schedule_timeout(word->timeout - now);
 	if (signal_pending_state(mode, current))
 		return -EINTR;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
-- 
cgit v1.2.3-71-gd317


From 325ea10c0809406ce23f038602abbc454f3f761d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Mar 2018 12:20:47 +0100
Subject: sched/headers: Simplify and clean up header usage in the scheduler

Do the following cleanups and simplifications:

 - sched/sched.h already includes <asm/paravirt.h>, so no need to
   include it in sched/core.c again.

 - order the <linux/sched/*.h> headers alphabetically

 - add all <linux/sched/*.h> headers to kernel/sched/sched.h

 - remove all unnecessary includes from the .c files that
   are already included in kernel/sched/sched.h.

Finally, make all scheduler .c files use a single common header:

  #include "sched.h"

... which now contains a union of the relied upon headers.

This makes the various .c files easier to read and easier to handle.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/deadline.h   |  6 ---
 kernel/sched/autogroup.c         |  9 ++---
 kernel/sched/autogroup.h         |  4 --
 kernel/sched/clock.c             | 14 +------
 kernel/sched/completion.c        |  5 +--
 kernel/sched/core.c              | 40 +++++++-------------
 kernel/sched/cpuacct.c           | 13 +------
 kernel/sched/cpudeadline.c       |  5 +--
 kernel/sched/cpudeadline.h       |  2 -
 kernel/sched/cpufreq.c           |  1 -
 kernel/sched/cpufreq_schedutil.c |  8 +---
 kernel/sched/cpupri.c            |  6 +--
 kernel/sched/cpupri.h            |  1 -
 kernel/sched/cputime.c           | 10 ++---
 kernel/sched/deadline.c          |  3 --
 kernel/sched/debug.c             | 11 +-----
 kernel/sched/fair.c              | 16 +-------
 kernel/sched/idle.c              | 15 +-------
 kernel/sched/idle_task.c         |  5 +--
 kernel/sched/isolation.c         |  7 ----
 kernel/sched/loadavg.c           |  4 --
 kernel/sched/membarrier.c        |  9 +----
 kernel/sched/rt.c                |  4 --
 kernel/sched/sched.h             | 81 ++++++++++++++++++++++++++--------------
 kernel/sched/stats.c             | 13 +++----
 kernel/sched/swait.c             |  3 +-
 kernel/sched/topology.c          |  4 --
 kernel/sched/wait.c              |  9 +----
 kernel/sched/wait_bit.c          |  5 +--
 29 files changed, 94 insertions(+), 219 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SCHED_DEADLINE_H
-#define _LINUX_SCHED_DEADLINE_H
-
-#include <linux/sched.h>
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
-
-#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index ff1b7b647b86..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/utsname.h>
-#include <linux/security.h>
-#include <linux/export.h>
-
+/*
+ * Auto-group scheduling implementation:
+ */
 #include "sched.h"
 
 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 49e6ec9559cf..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,10 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifdef CONFIG_SCHED_AUTOGROUP
 
-#include <linux/kref.h>
-#include <linux/rwsem.h>
-#include <linux/sched/autogroup.h>
-
 struct autogroup {
 	/*
 	 * Reference doesn't mean how many threads attach to this
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 7da6bec8a2ff..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -52,19 +52,7 @@
  * that is otherwise invisible (TSC gets stopped).
  *
  */
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-#include <linux/nmi.h>
-#include <linux/sched/clock.h>
-#include <linux/static_key.h>
-#include <linux/workqueue.h>
-#include <linux/compiler.h>
-#include <linux/tick.h>
-#include <linux/init.h>
+#include "sched.h"
 
 /*
  * Scheduler clock - returns current time in nanosec units.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..5d2d56b0817a 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
  * typically be used for exclusion which gives rise to priority inversion.
  * Waiting for completion is a typically sync point, but not an exclusion point.
  */
-
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/completion.h>
+#include "sched.h"
 
 /**
  * complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9427b59551c1..e1e334ba8ff9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/sched/loadavg.h>
-#include <linux/sched/hotplug.h>
-#include <linux/wait_bit.h>
-#include <linux/cpuset.h>
-#include <linux/delayacct.h>
-#include <linux/init_task.h>
-#include <linux/context_tracking.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/compat.h>
-
-#include <linux/blkdev.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/prefetch.h>
-#include <linux/profile.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
 
-#include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
 	raw_spin_unlock_irq(&rq->lock);
 }
 
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch()	do { } while (0)
+#endif
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 1abd325e733a..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,22 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/cgroup.h>
-#include <linux/slab.h>
-#include <linux/percpu.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/kernel_stat.h>
-#include <linux/err.h>
-
-#include "sched.h"
-
 /*
  * CPU accounting code for task groups.
  *
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
+#include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
 enum cpuacct_stat_index {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index cb172b61d191..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,10 +10,7 @@
  *  as published by the Free Software Foundation; version 2
  *  of the License.
  */
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include "cpudeadline.h"
+#include "sched.h"
 
 static inline int parent(int i)
 {
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index c26e7a0e5a66..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,6 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/sched.h>
-#include <linux/sched/deadline.h>
 
 #define IDX_INVALID		-1
 
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-
 #include "sched.h"
 
 DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0dad8160e00f..feb5f89020f2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,14 +11,10 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cpufreq.h>
-#include <linux/kthread.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/slab.h>
-#include <trace/events/power.h>
-
 #include "sched.h"
 
+#include <trace/events/power.h>
+
 struct sugov_tunables {
 	struct gov_attr_set	attr_set;
 	unsigned int		rate_limit_us;
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index f43e14ccb67d..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -26,11 +26,7 @@
  *  as published by the Free Software Foundation; version 2
  *  of the License.
  */
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/slab.h>
-#include "cpupri.h"
+#include "sched.h"
 
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
 static int convert_prio(int prio)
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 141a06c914c6..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,5 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d3b450b57ade..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kernel_stat.h>
-#include <linux/static_key.h>
-#include <linux/context_tracking.h>
-#include <linux/sched/cputime.h>
+/*
+ * Simple CPU accounting cgroup controller
+ */
 #include "sched.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58f8b7b37983..af491f537636 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
  */
 #include "sched.h"
 
-#include <linux/slab.h>
-#include <uapi/linux/sched/types.h>
-
 struct dl_bandwidth def_dl_bandwidth;
 
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7c82a9b88510..644d9a464380 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
 /*
  * kernel/sched/debug.c
  *
- * Print the CFS rbtree
+ * Print the CFS rbtree and other debugging details
  *
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  *
@@ -9,15 +9,6 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/proc_fs.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-#include <linux/mempolicy.h>
-#include <linux/debugfs.h>
-
 #include "sched.h"
 
 static DEFINE_SPINLOCK(sched_debug_lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1f877de96c9b..f5591071ae98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,24 +20,10 @@
  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
-#include <linux/sched/mm.h>
-#include <linux/sched/topology.h>
-
-#include <linux/latencytop.h>
-#include <linux/cpumask.h>
-#include <linux/cpuidle.h>
-#include <linux/slab.h>
-#include <linux/profile.h>
-#include <linux/interrupt.h>
-#include <linux/mempolicy.h>
-#include <linux/migrate.h>
-#include <linux/task_work.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
 
 #include <trace/events/sched.h>
 
-#include "sched.h"
-
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 343d25f85477..2760e0357271 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,10 @@
 /*
  * Generic entry points for the idle threads
  */
-#include <linux/sched.h>
-#include <linux/sched/idle.h>
-#include <linux/cpu.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuhotplug.h>
-#include <linux/tick.h>
-#include <linux/mm.h>
-#include <linux/stackprotector.h>
-#include <linux/suspend.h>
-#include <linux/livepatch.h>
-
-#include <asm/tlb.h>
+#include "sched.h"
 
 #include <trace/events/power.h>
 
-#include "sched.h"
-
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
 
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index ec73680922f8..488222ac4651 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -1,12 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
 /*
  * idle-task scheduling class.
  *
- * (NOTE: these are not related to SCHED_IDLE tasks which are
+ * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are
  *  handled in sched/fair.c)
  */
+#include "sched.h"
 
 #ifdef CONFIG_SMP
 static int
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index aad5f48a07c6..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -6,13 +6,6 @@
  * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
  *
  */
-#include <linux/sched/isolation.h>
-#include <linux/tick.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/static_key.h>
-#include <linux/ctype.h>
-
 #include "sched.h"
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a398e7e28a8a..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
  * figure. Its a silly number but people think its important. We go through
  * great pains to make it work on big machines and tickless kernels.
  */
-
-#include <linux/export.h>
-#include <linux/sched/loadavg.h>
-
 #include "sched.h"
 
 /*
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 2c6ae2413fa2..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,14 +13,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-#include <linux/cpumask.h>
-#include <linux/atomic.h>
-
-#include "sched.h"	/* for cpu_rq(). */
+#include "sched.h"
 
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40498872111..a3d438fec46c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
  * policies)
  */
-
 #include "sched.h"
 
-#include <linux/slab.h>
-#include <linux/irq_work.h>
-
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bd1461ae06e4..23ba4dd76ac4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,39 +3,71 @@
  * Scheduler internal types and methods:
  */
 #include <linux/sched.h>
+
 #include <linux/sched/autogroup.h>
-#include <linux/sched/sysctl.h>
-#include <linux/sched/topology.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
 #include <linux/sched/clock.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
 #include <linux/sched/cpufreq.h>
-#include <linux/sched/stat.h>
-#include <linux/sched/nohz.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/jobctl.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/prio.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
-#include <linux/sched/cputime.h>
-#include <linux/sched/init.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/user.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/xacct.h>
+
+#include <uapi/linux/sched/types.h>
 
-#include <linux/u64_stats_sync.h>
-#include <linux/kernel_stat.h>
 #include <linux/binfmts.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/kprobes.h>
+#include <linux/kthread.h>
+#include <linux/membarrier.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/nmi.h>
+#include <linux/proc_fs.h>
+#include <linux/prefetch.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/security.h>
+#include <linux/stackprotector.h>
 #include <linux/stop_machine.h>
-#include <linux/irq_work.h>
-#include <linux/tick.h>
-#include <linux/slab.h>
-#include <linux/cgroup.h>
+#include <linux/suspend.h>
+#include <linux/swait.h>
+#include <linux/syscalls.h>
+#include <linux/task_work.h>
+#include <linux/tsacct_kern.h>
+
+#include <asm/tlb.h>
 
 #ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
+# include <asm/paravirt.h>
 #endif
 
 #include "cpupri.h"
@@ -1357,13 +1389,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 	return p->on_rq == TASK_ON_RQ_MIGRATING;
 }
 
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next)	do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch()	do { } while (0)
-#endif
-
 /*
  * wake flags
  */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 968c1fe3099a..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
-
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
+/*
+ * /proc/schedstat implementation
+ */
 #include "sched.h"
 
 /*
- * bump this up when changing the output format or the meaning of an existing
+ * Current schedstat API version.
+ *
+ * Bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
 #define SCHEDSTAT_VERSION 15
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index b88ab4e0207f..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,8 +2,7 @@
 /*
  * <linux/swait.h> (simple wait queues ) implementation:
  */
-#include <linux/sched/signal.h>
-#include <linux/swait.h>
+#include "sched.h"
 
 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
 			     struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 219eee70e457..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
 /*
  * Scheduler topology setup/handling methods
  */
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/sched/isolation.h>
-
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7b2a142ae629..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
  *
  * (C) 2004 Nadia Yvette Chambers, Oracle
  */
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/mm.h>
-#include <linux/wait.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
+#include "sched.h"
 
 void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
 {
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 5293c59163a6..4239c78f5cd3 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
 /*
  * The implementation of the wait_bit*() and related waiting APIs:
  */
-#include <linux/wait_bit.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/hash.h>
+#include "sched.h"
 
 #define WAIT_TABLE_BITS 8
 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-- 
cgit v1.2.3-71-gd317


From a92057e14beb233e8c891f4de075f2a468c71f15 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Mar 2018 15:44:39 +0100
Subject: sched/idle: Merge kernel/sched/idle.c and kernel/sched/idle_task.c

Merge these two small .c modules as they implement two aspects
of idle task handling.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/Makefile    |   5 +-
 kernel/sched/idle.c      | 123 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/idle_task.c | 117 --------------------------------------------
 3 files changed, 125 insertions(+), 120 deletions(-)
 delete mode 100644 kernel/sched/idle_task.c

(limited to 'kernel')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2760e0357271..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,5 +1,9 @@
 /*
- * Generic entry points for the idle threads
+ * Generic entry points for the idle threads and
+ * implementation of the idle task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE batch scheduled
+ *        tasks which are handled in sched/fair.c )
  */
 #include "sched.h"
 
@@ -33,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
 static int __init cpu_idle_poll_setup(char *__unused)
 {
 	cpu_idle_force_poll = 1;
+
 	return 1;
 }
 __setup("nohlt", cpu_idle_poll_setup);
@@ -40,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
 static int __init cpu_idle_nopoll_setup(char *__unused)
 {
 	cpu_idle_force_poll = 0;
+
 	return 1;
 }
 __setup("hlt", cpu_idle_nopoll_setup);
@@ -51,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
 	stop_critical_timings();
+
 	while (!tif_need_resched() &&
 		(cpu_idle_force_poll || tick_check_broadcast_expired()))
 		cpu_relax();
 	start_critical_timings();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	rcu_idle_exit();
+
 	return 1;
 }
 
@@ -337,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
 	while (1)
 		do_idle();
 }
+
+/*
+ * idle-task scheduling class.
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+	return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif
+
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+	resched_curr(rq);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+	put_prev_task(rq, prev);
+	update_idle_core(rq);
+	schedstat_inc(rq->sched_goidle);
+
+	return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+	raw_spin_unlock_irq(&rq->lock);
+	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+	dump_stack();
+	raw_spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
+{
+	BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+{
+	BUG();
+}
+
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+{
+	return 0;
+}
+
+static void update_curr_idle(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+	/* .next is NULL */
+	/* no enqueue/yield_task for idle tasks */
+
+	/* dequeue is not valid, we print a debug message there: */
+	.dequeue_task		= dequeue_task_idle,
+
+	.check_preempt_curr	= check_preempt_curr_idle,
+
+	.pick_next_task		= pick_next_task_idle,
+	.put_prev_task		= put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+	.set_cpus_allowed	= set_cpus_allowed_common,
+#endif
+
+	.set_curr_task          = set_curr_task_idle,
+	.task_tick		= task_tick_idle,
+
+	.get_rr_interval	= get_rr_interval_idle,
+
+	.prio_changed		= prio_changed_idle,
+	.switched_to		= switched_to_idle,
+	.update_curr		= update_curr_idle,
+};
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index 488222ac4651..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * idle-task scheduling class.
- *
- * (NOTE: these are not related to SCHED_IDLE batch scheduling tasks which are
- *  handled in sched/fair.c)
- */
-#include "sched.h"
-
-#ifdef CONFIG_SMP
-static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
-{
-	return task_cpu(p); /* IDLE tasks as never migrated */
-}
-#endif
-
-/*
- * Idle tasks are unconditionally rescheduled:
- */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
-{
-	resched_curr(rq);
-}
-
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
-	put_prev_task(rq, prev);
-	update_idle_core(rq);
-	schedstat_inc(rq->sched_goidle);
-
-	return rq->idle;
-}
-
-/*
- * It is not legal to sleep in the idle task - print a warning
- * message if some code attempts to do it:
- */
-static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
-{
-	raw_spin_unlock_irq(&rq->lock);
-	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
-	dump_stack();
-	raw_spin_lock_irq(&rq->lock);
-}
-
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
-/*
- * scheduler tick hitting a task of our scheduling class.
- *
- * NOTE: This function can be called remotely by the tick offload that
- * goes along full dynticks. Therefore no local assumption can be made
- * and everything must be accessed through the @rq and @curr passed in
- * parameters.
- */
-static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
-{
-}
-
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
-{
-	BUG();
-}
-
-static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
-{
-	BUG();
-}
-
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
-	return 0;
-}
-
-static void update_curr_idle(struct rq *rq)
-{
-}
-
-/*
- * Simple, special scheduling class for the per-CPU idle tasks:
- */
-const struct sched_class idle_sched_class = {
-	/* .next is NULL */
-	/* no enqueue/yield_task for idle tasks */
-
-	/* dequeue is not valid, we print a debug message there: */
-	.dequeue_task		= dequeue_task_idle,
-
-	.check_preempt_curr	= check_preempt_curr_idle,
-
-	.pick_next_task		= pick_next_task_idle,
-	.put_prev_task		= put_prev_task_idle,
-
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_idle,
-	.set_cpus_allowed	= set_cpus_allowed_common,
-#endif
-
-	.set_curr_task          = set_curr_task_idle,
-	.task_tick		= task_tick_idle,
-
-	.get_rr_interval	= get_rr_interval_idle,
-
-	.prio_changed		= prio_changed_idle,
-	.switched_to		= switched_to_idle,
-	.update_curr		= update_curr_idle,
-};
-- 
cgit v1.2.3-71-gd317


From 02d8ec9456f47b8865f1ff3fbb532e12a760d3b5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Mar 2018 16:27:54 +0100
Subject: sched/deadline, rt: Rename queue_push_tasks/queue_pull_task to create
 separate namespace

There are similarly named functions in both of these modules:

  kernel/sched/deadline.c:static inline void queue_push_tasks(struct rq *rq)
  kernel/sched/deadline.c:static inline void queue_pull_task(struct rq *rq)
  kernel/sched/deadline.c:static inline void queue_push_tasks(struct rq *rq)
  kernel/sched/deadline.c:static inline void queue_pull_task(struct rq *rq)
  kernel/sched/deadline.c:	queue_push_tasks(rq);
  kernel/sched/deadline.c:	queue_pull_task(rq);
  kernel/sched/deadline.c:			queue_push_tasks(rq);
  kernel/sched/deadline.c:			queue_pull_task(rq);
  kernel/sched/rt.c:static inline void queue_push_tasks(struct rq *rq)
  kernel/sched/rt.c:static inline void queue_pull_task(struct rq *rq)
  kernel/sched/rt.c:static inline void queue_push_tasks(struct rq *rq)
  kernel/sched/rt.c:	queue_push_tasks(rq);
  kernel/sched/rt.c:	queue_pull_task(rq);
  kernel/sched/rt.c:			queue_push_tasks(rq);
  kernel/sched/rt.c:			queue_pull_task(rq);

... which makes it harder to grep for them. Prefix them with
deadline_ and rt_, respectively.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 16 ++++++++--------
 kernel/sched/rt.c       | 14 +++++++-------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index af491f537636..8b7c2b35bec9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 static void push_dl_tasks(struct rq *);
 static void pull_dl_task(struct rq *);
 
-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
@@ -519,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
 	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
 }
 
-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
 {
 	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
@@ -594,11 +594,11 @@ static inline void pull_dl_task(struct rq *rq)
 {
 }
 
-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
 {
 }
 
-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1759,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	queue_push_tasks(rq);
+	deadline_queue_push_tasks(rq);
 
 	return p;
 }
@@ -2309,7 +2309,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	queue_pull_task(rq);
+	deadline_queue_pull_task(rq);
 }
 
 /*
@@ -2331,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
-			queue_push_tasks(rq);
+			deadline_queue_push_tasks(rq);
 #endif
 		if (dl_task(rq->curr))
 			check_preempt_curr_dl(rq, p, 0);
@@ -2356,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			queue_pull_task(rq);
+			deadline_queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a3d438fec46c..4f4fd3b157f1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -355,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 static void push_rt_tasks(struct rq *);
 static void pull_rt_task(struct rq *);
 
-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
@@ -363,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
 	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 }
 
-static inline void queue_pull_task(struct rq *rq)
+static inline void rt_queue_pull_task(struct rq *rq)
 {
 	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
@@ -421,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
 {
 }
 
-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1565,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	queue_push_tasks(rq);
+	rt_queue_push_tasks(rq);
 
 	return p;
 }
@@ -2185,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	queue_pull_task(rq);
+	rt_queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2216,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
-			queue_push_tasks(rq);
+			rt_queue_push_tasks(rq);
 #endif /* CONFIG_SMP */
 		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
 			resched_curr(rq);
@@ -2240,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			queue_pull_task(rq);
+			rt_queue_pull_task(rq);
 
 		/*
 		 * If there's a higher priority task waiting to run
-- 
cgit v1.2.3-71-gd317


From 14a7405b2e814221a951bd7a76ce4a8d24c1b3be Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 3 Mar 2018 16:32:24 +0100
Subject: sched/core: Undefine tracepoint creation at the end of core.c

Make it easier to concatenate all the scheduler .c files for single-module
compilation.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1e334ba8ff9..4f5eeb63ab5b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7071,3 +7071,5 @@ const u32 sched_prio_to_wmult[40] = {
  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
+
+#undef CREATE_TRACE_POINTS
-- 
cgit v1.2.3-71-gd317


From 167f5594b5efa20a26ff03b3424f793887e6b448 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Tue, 6 Mar 2018 15:56:47 +1100
Subject: kernel/memremap: Remove stale devres_free() call

devm_memremap_pages() was re-worked in e8d513483300 "memremap: change
devm_memremap_pages interface to use struct dev_pagemap" to take a
caller allocated struct dev_pagemap as a function parameter. A call to
devres_free() was left in the error cleanup path which results in a
kernel panic if the remap fails for some reason. Remove it to fix the
panic and let devm_memremap_pages() fail gracefully.

Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/memremap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 4dd4274cabe2..895e6b76b25e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -427,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
  err_pfn_remap:
  err_radix:
 	pgmap_radix_release(res, pgoff);
-	devres_free(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL(devm_memremap_pages);
-- 
cgit v1.2.3-71-gd317


From 5ad751053704df3f00d2bb2dc9345c697c212150 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 6 Mar 2018 10:49:12 +0100
Subject: panic: Add closing panic marker parenthesis

Otherwise it looks unbalanced.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Link: https://lkml.kernel.org/r/20180306094920.16917-2-bp@alien8.de
---
 kernel/panic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 2cfef408fec9..9fb023d0cae1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -289,7 +289,7 @@ void panic(const char *fmt, ...)
 		disabled_wait(caller);
 	}
 #endif
-	pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
+	pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
 	local_irq_enable();
 	for (i = 0; ; i += PANIC_TIMER_STEP) {
 		touch_softlockup_watchdog();
-- 
cgit v1.2.3-71-gd317


From 3f553b308bb004eb730da8e00a28150c157c7724 Mon Sep 17 00:00:00 2001
From: Leon Yu <chianglungyu@gmail.com>
Date: Tue, 6 Mar 2018 23:16:24 +0800
Subject: module: propagate error in modules_open()

otherwise kernel can oops later in seq_release() due to dereferencing null
file->private_data which is only set if seq_open() succeeds.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
IP: seq_release+0xc/0x30
Call Trace:
 close_pdeo+0x37/0xd0
 proc_reg_release+0x5d/0x60
 __fput+0x9d/0x1d0
 ____fput+0x9/0x10
 task_work_run+0x75/0x90
 do_exit+0x252/0xa00
 do_group_exit+0x36/0xb0
 SyS_exit_group+0xf/0x10

Fixes: 516fb7f2e73d ("/proc/module: use the same logic as /proc/kallsyms for address exposure")
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: stable@vger.kernel.org # 4.15+
Signed-off-by: Leon Yu <chianglungyu@gmail.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ad2d420024f6..e42764acedb4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file)
 		m->private = kallsyms_show_value() ? NULL : (void *)8ul;
 	}
 
-	return 0;
+	return err;
 }
 
 static const struct file_operations proc_modules_operations = {
-- 
cgit v1.2.3-71-gd317


From 13a453c241b78934a945b1af572d0533612c9bd1 Mon Sep 17 00:00:00 2001
From: Norbert Manthey <nmanthey@amazon.de>
Date: Tue, 27 Feb 2018 08:47:40 +0100
Subject: sched/fair: Add ';' after label attributes

Due to using GCC defines for configuration, some labels might be unused in
certain configurations. While adding a __maybe_unused to the label is
fine in general, the line has to be terminated with ';'. This is also
reflected in the GCC documentation, but GCC parsed the previous variant
without an error message.

This has been spotted while compiling with goto-cc, the compiler for the
CPROVER tool suite.

Signed-off-by: Norbert Manthey <nmanthey@amazon.de>
Signed-off-by: Michael Tautschnig <tautschn@amazon.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1519717660-16157-1-git-send-email-nmanthey@amazon.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f5591071ae98..097db34d5ba2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6797,7 +6797,7 @@ simple:
 
 	p = task_of(se);
 
-done: __maybe_unused
+done: __maybe_unused;
 #ifdef CONFIG_SMP
 	/*
 	 * Move the next running task to the front of
-- 
cgit v1.2.3-71-gd317


From 4042d003a0792a3b05c7c424219e4c6cf1abfe76 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Dec 2017 15:37:26 +0100
Subject: cpufreq/schedutil: Remove unused CPUFREQ_DL

Bitrot...

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/cpufreq.h | 3 +--
 kernel/sched/deadline.c       | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 0b55834efd46..d963cfd3a0c2 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -9,8 +9,7 @@
  */
 
 #define SCHED_CPUFREQ_RT	(1U << 0)
-#define SCHED_CPUFREQ_DL	(1U << 1)
-#define SCHED_CPUFREQ_IOWAIT	(1U << 2)
+#define SCHED_CPUFREQ_IOWAIT	(1U << 1)
 
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b7c2b35bec9..d1c7bf7c7e5b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -84,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
 	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
 	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
 }
 
 static inline
@@ -98,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	if (dl_rq->running_bw > old)
 		dl_rq->running_bw = 0;
 	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
 }
 
 static inline
-- 
cgit v1.2.3-71-gd317


From 8f111bc357aa811e0bb5fdfe34c4c9efdafc15b9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 20 Dec 2017 16:26:12 +0100
Subject: cpufreq/schedutil: Rewrite CPUFREQ_RT support

Instead of trying to duplicate scheduler state to track if an RT task
is running, directly use the scheduler runqueue state for it.

This vastly simplifies things and fixes a number of bugs related to
sugov and the scheduler getting out of sync wrt this state.

As a consequence we not also update the remove cfs/dl state when
iterating the shared mask.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/cpufreq.h    |  3 +-
 kernel/sched/cpufreq_schedutil.c | 74 ++++++++++++++++++----------------------
 kernel/sched/rt.c                |  9 +++--
 3 files changed, 41 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index d963cfd3a0c2..b48f2fb3b316 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,8 +8,7 @@
  * Interface between cpufreq drivers and the scheduler:
  */
 
-#define SCHED_CPUFREQ_RT	(1U << 0)
-#define SCHED_CPUFREQ_IOWAIT	(1U << 1)
+#define SCHED_CPUFREQ_IOWAIT	(1U << 0)
 
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index feb5f89020f2..89fe78ecb88c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -57,7 +57,6 @@ struct sugov_cpu {
 	unsigned long		util_cfs;
 	unsigned long		util_dl;
 	unsigned long		max;
-	unsigned int		flags;
 
 	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
@@ -183,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
+	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util;
+
+	if (rq->rt.rt_nr_running) {
+		util = sg_cpu->max;
+	} else {
+		util = sg_cpu->util_dl;
+		if (rq->cfs.h_nr_running)
+			util += sg_cpu->util_cfs;
+	}
+
 	/*
 	 * Ideally we would like to set util_dl as min/guaranteed freq and
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
+	return min(util, sg_cpu->max);
 }
 
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
 {
-	if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
+	if (flags & SCHED_CPUFREQ_IOWAIT) {
 		if (sg_cpu->iowait_boost_pending)
 			return;
 
@@ -262,12 +272,11 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned long util, max;
 	unsigned int next_f;
 	bool busy;
 
-	sugov_set_iowait_boost(sg_cpu, time);
+	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
 	if (!sugov_should_update_freq(sg_policy, time))
@@ -275,25 +284,22 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	if (flags & SCHED_CPUFREQ_RT) {
-		next_f = policy->cpuinfo.max_freq;
-	} else {
-		sugov_get_util(sg_cpu);
-		max = sg_cpu->max;
-		util = sugov_aggregate_util(sg_cpu);
-		sugov_iowait_boost(sg_cpu, &util, &max);
-		next_f = get_next_freq(sg_policy, util, max);
-		/*
-		 * Do not reduce the frequency if the CPU has not been idle
-		 * recently, as the reduction is likely to be premature then.
-		 */
-		if (busy && next_f < sg_policy->next_freq) {
-			next_f = sg_policy->next_freq;
+	sugov_get_util(sg_cpu);
+	max = sg_cpu->max;
+	util = sugov_aggregate_util(sg_cpu);
+	sugov_iowait_boost(sg_cpu, &util, &max);
+	next_f = get_next_freq(sg_policy, util, max);
+	/*
+	 * Do not reduce the frequency if the CPU has not been idle
+	 * recently, as the reduction is likely to be premature then.
+	 */
+	if (busy && next_f < sg_policy->next_freq) {
+		next_f = sg_policy->next_freq;
 
-			/* Reset cached freq as next_freq has changed */
-			sg_policy->cached_raw_freq = 0;
-		}
+		/* Reset cached freq as next_freq has changed */
+		sg_policy->cached_raw_freq = 0;
 	}
+
 	sugov_update_commit(sg_policy, time, next_f);
 }
 
@@ -309,6 +315,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		unsigned long j_util, j_max;
 		s64 delta_ns;
 
+		sugov_get_util(j_sg_cpu);
+
 		/*
 		 * If the CFS CPU utilization was last updated before the
 		 * previous frequency update and the time elapsed between the
@@ -322,21 +330,15 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		if (delta_ns > TICK_NSEC) {
 			j_sg_cpu->iowait_boost = 0;
 			j_sg_cpu->iowait_boost_pending = false;
-			j_sg_cpu->util_cfs = 0;
-			if (j_sg_cpu->util_dl == 0)
-				continue;
 		}
-		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
-			return policy->cpuinfo.max_freq;
 
 		j_max = j_sg_cpu->max;
 		j_util = sugov_aggregate_util(j_sg_cpu);
+		sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
 		if (j_util * max > j_max * util) {
 			util = j_util;
 			max = j_max;
 		}
-
-		sugov_iowait_boost(j_sg_cpu, &util, &max);
 	}
 
 	return get_next_freq(sg_policy, util, max);
@@ -351,18 +353,11 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 
 	raw_spin_lock(&sg_policy->update_lock);
 
-	sugov_get_util(sg_cpu);
-	sg_cpu->flags = flags;
-
-	sugov_set_iowait_boost(sg_cpu, time);
+	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
 	if (sugov_should_update_freq(sg_policy, time)) {
-		if (flags & SCHED_CPUFREQ_RT)
-			next_f = sg_policy->policy->cpuinfo.max_freq;
-		else
-			next_f = sugov_next_freq_shared(sg_cpu, time);
-
+		next_f = sugov_next_freq_shared(sg_cpu, time);
 		sugov_update_commit(sg_policy, time, next_f);
 	}
 
@@ -673,7 +668,6 @@ static int sugov_start(struct cpufreq_policy *policy)
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
 		sg_cpu->cpu			= cpu;
 		sg_cpu->sg_policy		= sg_policy;
-		sg_cpu->flags			= 0;
 		sg_cpu->iowait_boost_max	= policy->cpuinfo.max_freq;
 	}
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f4fd3b157f1..86b77987435e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -957,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
 	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
-
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
 
@@ -1001,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
 
 	sub_nr_running(rq, rt_rq->rt_nr_running);
 	rt_rq->rt_queued = 0;
+
+	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1017,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
 	add_nr_running(rq, rt_rq->rt_nr_running);
 	rt_rq->rt_queued = 1;
+
+	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq, 0);
 }
 
 #if defined CONFIG_SMP
-- 
cgit v1.2.3-71-gd317


From a22e47a4e3f5a9e50a827c5d94705ace3b1eac0b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 10:01:24 +0100
Subject: sched/core: Convert nohz_flags to atomic_t

Using atomic_t allows us to use the more flexible bitops provided
there. Also its smaller.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  6 +++---
 kernel/sched/fair.c  | 23 +++++++++++++++--------
 kernel/sched/sched.h | 11 ++++++-----
 3 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4f5eeb63ab5b..96ad1c003d74 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
 		return false;
 
 	if (idle_cpu(cpu) && !need_resched())
@@ -593,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
 	 * We can't run Idle Load Balance on this CPU for this time so we
 	 * cancel it and clear NOHZ_BALANCE_KICK
 	 */
-	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 	return false;
 }
 
@@ -6074,7 +6074,7 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
-		rq->nohz_flags = 0;
+		atomic_set(&rq->nohz_flags, 0);
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 097db34d5ba2..5d150478dd58 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9072,6 +9072,7 @@ static inline int find_new_ilb(void)
  */
 static void nohz_balancer_kick(void)
 {
+	unsigned int flags;
 	int ilb_cpu;
 
 	nohz.next_balance++;
@@ -9081,7 +9082,8 @@ static void nohz_balancer_kick(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+	flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
+	if (flags & NOHZ_BALANCE_KICK)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
@@ -9095,7 +9097,9 @@ static void nohz_balancer_kick(void)
 
 void nohz_balance_exit_idle(unsigned int cpu)
 {
-	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+	unsigned int flags = atomic_read(nohz_flags(cpu));
+
+	if (unlikely(flags & NOHZ_TICK_STOPPED)) {
 		/*
 		 * Completely isolated CPUs don't ever set, so we must test.
 		 */
@@ -9103,7 +9107,8 @@ void nohz_balance_exit_idle(unsigned int cpu)
 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 			atomic_dec(&nohz.nr_cpus);
 		}
-		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+
+		atomic_andnot(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 	}
 }
 
@@ -9155,7 +9160,7 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
-	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+	if (atomic_read(nohz_flags(cpu)) & NOHZ_TICK_STOPPED)
 		return;
 
 	/* If we're a completely isolated CPU, we don't play: */
@@ -9164,7 +9169,7 @@ void nohz_balance_enter_idle(int cpu)
 
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
-	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+	atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 #endif
 
@@ -9302,8 +9307,10 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 
-	if (idle != CPU_IDLE ||
-	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
+		return;
+
+	if (idle != CPU_IDLE)
 		goto end;
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
@@ -9349,7 +9356,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 end:
-	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 23ba4dd76ac4..d98e761b962f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -763,7 +763,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long		last_load_update_tick;
 #endif /* CONFIG_SMP */
-	unsigned long		nohz_flags;
+	atomic_t nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
 	/* capture load from *all* tasks on this CPU: */
@@ -2034,10 +2034,11 @@ extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
 
 #ifdef CONFIG_NO_HZ_COMMON
-enum rq_nohz_flag_bits {
-	NOHZ_TICK_STOPPED,
-	NOHZ_BALANCE_KICK,
-};
+#define NOHZ_TICK_STOPPED_BIT	0
+#define NOHZ_BALANCE_KICK_BIT	1
+
+#define NOHZ_TICK_STOPPED	BIT(NOHZ_TICK_STOPPED_BIT)
+#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 
-- 
cgit v1.2.3-71-gd317


From b7031a02ec753bf9b52a94a966b05e1abad3b7a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 10:11:09 +0100
Subject: sched/fair: Add NOHZ_STATS_KICK

Split the NOHZ idle balancer into doing two separate actions:

 - update blocked load statistic

 - actually load-balance

Since the latter requires the former, ensure this happens. For now
always tag both bits at the same time.

Prepares for a future where we can toggle only the STATS bit.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  4 ++--
 kernel/sched/fair.c  | 52 +++++++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h |  4 ++++
 3 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 96ad1c003d74..69c9a6b07b61 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
 
-	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
+	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
 		return false;
 
 	if (idle_cpu(cpu) && !need_resched())
@@ -593,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
 	 * We can't run Idle Load Balance on this CPU for this time so we
 	 * cancel it and clear NOHZ_BALANCE_KICK
 	 */
-	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
 	return false;
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5d150478dd58..fc058967c999 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9082,8 +9082,8 @@ static void nohz_balancer_kick(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
-	if (flags & NOHZ_BALANCE_KICK)
+	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
+	if (flags & NOHZ_KICK_MASK)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
@@ -9202,8 +9202,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 	int need_serialize, need_decay = 0;
 	u64 max_cost = 0;
 
-	update_blocked_averages(cpu);
-
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		/*
@@ -9298,20 +9296,27 @@ out:
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the CPUs for whom scheduler ticks are stopped.
  */
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-	int this_cpu = this_rq->cpu;
-	struct rq *rq;
-	int balance_cpu;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+	int balance_cpu;
+	struct rq *rq;
 
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
-		return;
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
 
-	if (idle != CPU_IDLE)
-		goto end;
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+
+	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
@@ -9339,7 +9344,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);
 
-			rebalance_domains(rq, CPU_IDLE);
+			update_blocked_averages(rq->cpu);
+			if (flags & NOHZ_BALANCE_KICK)
+				rebalance_domains(rq, CPU_IDLE);
 		}
 
 		if (time_after(next_balance, rq->next_balance)) {
@@ -9348,6 +9355,10 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		}
 	}
 
+	update_blocked_averages(this_cpu);
+	if (flags & NOHZ_BALANCE_KICK)
+		rebalance_domains(this_rq, CPU_IDLE);
+
 	/*
 	 * next_balance will be updated only when there is a need.
 	 * When the CPU is attached to null domain for ex, it will not be
@@ -9355,8 +9366,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	 */
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
-end:
-	atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+
+	return true;
 }
 
 /*
@@ -9443,7 +9454,10 @@ unlock:
 	return kick;
 }
 #else
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	return false;
+}
 #endif
 
 /*
@@ -9464,7 +9478,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 	 * load balance only within the local sched_domain hierarchy
 	 * and abort nohz_idle_balance altogether if we pull some load.
 	 */
-	nohz_idle_balance(this_rq, idle);
+	if (nohz_idle_balance(this_rq, idle))
+		return;
+
+	/* normal load balance */
+	update_blocked_averages(this_rq->cpu);
 	rebalance_domains(this_rq, idle);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d98e761b962f..5295f274053b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2036,9 +2036,13 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_TICK_STOPPED_BIT	0
 #define NOHZ_BALANCE_KICK_BIT	1
+#define NOHZ_STATS_KICK_BIT	2
 
 #define NOHZ_TICK_STOPPED	BIT(NOHZ_TICK_STOPPED_BIT)
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
+#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+
+#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 
-- 
cgit v1.2.3-71-gd317


From 4550487a993d579c7329bb5b19e516d36800c8bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 10:47:48 +0100
Subject: sched/fair: Restructure nohz_balance_kick()

The current:

	if (nohz_kick_needed())
		nohz_balancer_kick()

is pointless complexity, fold them into a single call and avoid the
various conditions at the call site.

When we introduce multiple different needs to kick the ilb, the above
construct also becomes a problem.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 218 ++++++++++++++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fc058967c999..fa483d889f07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9065,12 +9065,29 @@ static inline int find_new_ilb(void)
 	return nr_cpu_ids;
 }
 
+static inline void set_cpu_sd_state_busy(void)
+{
+	struct sched_domain *sd;
+	int cpu = smp_processor_id();
+
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+	if (!sd || !sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 0;
+
+	atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+	rcu_read_unlock();
+}
+
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void kick_ilb(void)
 {
 	unsigned int flags;
 	int ilb_cpu;
@@ -9085,6 +9102,7 @@ static void nohz_balancer_kick(void)
 	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
 	if (flags & NOHZ_KICK_MASK)
 		return;
+
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
 	 * This way we generate a sched IPI on the target CPU which
@@ -9092,7 +9110,94 @@ static void nohz_balancer_kick(void)
 	 * will be run before returning from the IPI.
 	 */
 	smp_send_reschedule(ilb_cpu);
-	return;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system.
+ *   - This rq has more than one task.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ *     multiple busy cpu.
+ *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *     domain span are idle.
+ */
+static void nohz_balancer_kick(struct rq *rq)
+{
+	unsigned long now = jiffies;
+	struct sched_domain_shared *sds;
+	struct sched_domain *sd;
+	int nr_busy, i, cpu = rq->cpu;
+	bool kick = false;
+
+	if (unlikely(rq->idle_balance))
+		return;
+
+	/*
+	 * We may be recently in ticked or tickless idle mode. At the first
+	 * busy tick after returning from idle, we will update the busy stats.
+	 */
+	set_cpu_sd_state_busy();
+	nohz_balance_exit_idle(cpu);
+
+	/*
+	 * None are in tickless mode and hence no need for NOHZ idle load
+	 * balancing.
+	 */
+	if (likely(!atomic_read(&nohz.nr_cpus)))
+		return;
+
+	if (time_before(now, nohz.next_balance))
+		return;
+
+	if (rq->nr_running >= 2) {
+		kick = true;
+		goto out;
+	}
+
+	rcu_read_lock();
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds) {
+		/*
+		 * XXX: write a coherent comment on why we do this.
+		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+		 */
+		nr_busy = atomic_read(&sds->nr_busy_cpus);
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
+
+	}
+
+	sd = rcu_dereference(rq->sd);
+	if (sd) {
+		if ((rq->cfs.h_nr_running >= 1) &&
+				check_cpu_capacity(rq, sd)) {
+			kick = true;
+			goto unlock;
+		}
+	}
+
+	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	if (sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (i == cpu ||
+			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+				continue;
+
+			if (sched_asym_prefer(i, cpu)) {
+				kick = true;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+out:
+	if (kick)
+		kick_ilb();
 }
 
 void nohz_balance_exit_idle(unsigned int cpu)
@@ -9112,23 +9217,6 @@ void nohz_balance_exit_idle(unsigned int cpu)
 	}
 }
 
-static inline void set_cpu_sd_state_busy(void)
-{
-	struct sched_domain *sd;
-	int cpu = smp_processor_id();
-
-	rcu_read_lock();
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
-	if (!sd || !sd->nohz_idle)
-		goto unlock;
-	sd->nohz_idle = 0;
-
-	atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
-	rcu_read_unlock();
-}
-
 void set_cpu_sd_state_idle(void)
 {
 	struct sched_domain *sd;
@@ -9171,6 +9259,8 @@ void nohz_balance_enter_idle(int cpu)
 	atomic_inc(&nohz.nr_cpus);
 	atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
+#else
+static inline void nohz_balancer_kick(struct rq *rq) { }
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -9369,90 +9459,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 	return true;
 }
-
-/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle CPU in the system.
- *   - This rq has more than one task.
- *   - This rq has at least one CFS task and the capacity of the CPU is
- *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this CPU's scheduler group has
- *     multiple busy CPUs.
- *   - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
- *     domain span are idle.
- */
-static inline bool nohz_kick_needed(struct rq *rq)
-{
-	unsigned long now = jiffies;
-	struct sched_domain_shared *sds;
-	struct sched_domain *sd;
-	int nr_busy, i, cpu = rq->cpu;
-	bool kick = false;
-
-	if (unlikely(rq->idle_balance))
-		return false;
-
-       /*
-	* We may be recently in ticked or tickless idle mode. At the first
-	* busy tick after returning from idle, we will update the busy stats.
-	*/
-	set_cpu_sd_state_busy();
-	nohz_balance_exit_idle(cpu);
-
-	/*
-	 * None are in tickless mode and hence no need for NOHZ idle load
-	 * balancing.
-	 */
-	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return false;
-
-	if (time_before(now, nohz.next_balance))
-		return false;
-
-	if (rq->nr_running >= 2)
-		return true;
-
-	rcu_read_lock();
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds) {
-		/*
-		 * XXX: write a coherent comment on why we do this.
-		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
-		 */
-		nr_busy = atomic_read(&sds->nr_busy_cpus);
-		if (nr_busy > 1) {
-			kick = true;
-			goto unlock;
-		}
-
-	}
-
-	sd = rcu_dereference(rq->sd);
-	if (sd) {
-		if ((rq->cfs.h_nr_running >= 1) &&
-				check_cpu_capacity(rq, sd)) {
-			kick = true;
-			goto unlock;
-		}
-	}
-
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-	if (sd) {
-		for_each_cpu(i, sched_domain_span(sd)) {
-			if (i == cpu ||
-			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
-				continue;
-
-			if (sched_asym_prefer(i, cpu)) {
-				kick = true;
-				goto unlock;
-			}
-		}
-	}
-unlock:
-	rcu_read_unlock();
-	return kick;
-}
 #else
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
@@ -9497,10 +9503,8 @@ void trigger_load_balance(struct rq *rq)
 
 	if (time_after_eq(jiffies, rq->next_balance))
 		raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ_COMMON
-	if (nohz_kick_needed(rq))
-		nohz_balancer_kick();
-#endif
+
+	nohz_balancer_kick(rq);
 }
 
 static void rq_online_fair(struct rq *rq)
-- 
cgit v1.2.3-71-gd317


From a4064fb614f83c0a097c5ff7fe433c4aa139c7af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 10:42:50 +0100
Subject: sched/fair: Add NOHZ stats balancing

Teach the idle balancer about the need to update statistics which have
a different periodicity from regular balancing.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fa483d889f07..d8693fa9e7c5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9053,6 +9053,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	unsigned long next_balance;     /* in jiffy units */
+	unsigned long next_stats;
 } nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
@@ -9087,9 +9088,8 @@ unlock:
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void kick_ilb(void)
+static void kick_ilb(unsigned int flags)
 {
-	unsigned int flags;
 	int ilb_cpu;
 
 	nohz.next_balance++;
@@ -9099,7 +9099,7 @@ static void kick_ilb(void)
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
 
-	flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
+	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
 	if (flags & NOHZ_KICK_MASK)
 		return;
 
@@ -9129,7 +9129,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
 	int nr_busy, i, cpu = rq->cpu;
-	bool kick = false;
+	unsigned int flags = 0;
 
 	if (unlikely(rq->idle_balance))
 		return;
@@ -9148,11 +9148,14 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
+	if (time_after(now, nohz.next_stats))
+		flags = NOHZ_STATS_KICK;
+
 	if (time_before(now, nohz.next_balance))
-		return;
+		goto out;
 
 	if (rq->nr_running >= 2) {
-		kick = true;
+		flags = NOHZ_KICK_MASK;
 		goto out;
 	}
 
@@ -9165,7 +9168,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
-			kick = true;
+			flags = NOHZ_KICK_MASK;
 			goto unlock;
 		}
 
@@ -9175,7 +9178,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (sd) {
 		if ((rq->cfs.h_nr_running >= 1) &&
 				check_cpu_capacity(rq, sd)) {
-			kick = true;
+			flags = NOHZ_KICK_MASK;
 			goto unlock;
 		}
 	}
@@ -9188,7 +9191,7 @@ static void nohz_balancer_kick(struct rq *rq)
 				continue;
 
 			if (sched_asym_prefer(i, cpu)) {
-				kick = true;
+				flags = NOHZ_KICK_MASK;
 				goto unlock;
 			}
 		}
@@ -9196,8 +9199,8 @@ static void nohz_balancer_kick(struct rq *rq)
 unlock:
 	rcu_read_unlock();
 out:
-	if (kick)
-		kick_ilb();
+	if (flags)
+		kick_ilb(flags);
 }
 
 void nohz_balance_exit_idle(unsigned int cpu)
@@ -9389,7 +9392,9 @@ out:
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
-	unsigned long next_balance = jiffies + 60*HZ;
+	unsigned long now = jiffies;
+	unsigned long next_balance = now + 60*HZ;
+	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
 	unsigned int flags;
@@ -9449,6 +9454,8 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
+	nohz.next_stats = next_stats;
+
 	/*
 	 * next_balance will be updated only when there is a need.
 	 * When the CPU is attached to null domain for ex, it will not be
-- 
cgit v1.2.3-71-gd317


From e022e0d38ad475fc650f22efa3deb2fb96e62542 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 11:20:23 +0100
Subject: sched/fair: Update blocked load from NEWIDLE

Since we already iterate CPUs looking for work on NEWIDLE, use this
iteration to age the blocked load. If the domain for which this is
done completely spand the idle set, we can push the ILB based aging
forward.

Suggested-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  1 +
 kernel/sched/fair.c  | 49 +++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |  1 +
 3 files changed, 45 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 69c9a6b07b61..8a10a2ce30a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6074,6 +6074,7 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
+		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
 #endif
 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d8693fa9e7c5..85232dad89c9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5376,6 +5376,14 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 	}
 	return load;
 }
+
+static struct {
+	cpumask_var_t idle_cpus_mask;
+	atomic_t nr_cpus;
+	unsigned long next_balance;     /* in jiffy units */
+	unsigned long next_stats;
+} nohz ____cacheline_aligned;
+
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /**
@@ -7022,6 +7030,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
+#define LBF_NOHZ_STATS	0x10
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7460,6 +7469,10 @@ static void update_blocked_averages(int cpu)
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
 	}
+
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7519,6 +7532,9 @@ static inline void update_blocked_averages(int cpu)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+#ifdef CONFIG_NO_HZ_COMMON
+	rq->last_blocked_load_update_tick = jiffies;
+#endif
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7853,6 +7869,21 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
+static void update_nohz_stats(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+	unsigned int cpu = rq->cpu;
+
+	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+		return;
+
+	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+		return;
+
+	update_blocked_averages(cpu);
+#endif
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -7875,6 +7906,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		if (env->flags & LBF_NOHZ_STATS)
+			update_nohz_stats(rq);
+
 		/* Bias balancing toward CPUs of our domain: */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -8030,6 +8064,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if (env->idle == CPU_NEWLY_IDLE) {
+		env->flags |= LBF_NOHZ_STATS;
+
+		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
+			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	}
+#endif
+
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
@@ -9049,12 +9092,6 @@ static inline int on_null_domain(struct rq *rq)
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
-static struct {
-	cpumask_var_t idle_cpus_mask;
-	atomic_t nr_cpus;
-	unsigned long next_balance;     /* in jiffy units */
-	unsigned long next_stats;
-} nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5295f274053b..21381d276709 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -762,6 +762,7 @@ struct rq {
 #ifdef CONFIG_NO_HZ_COMMON
 #ifdef CONFIG_SMP
 	unsigned long		last_load_update_tick;
+	unsigned long		last_blocked_load_update_tick;
 #endif /* CONFIG_SMP */
 	atomic_t nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
-- 
cgit v1.2.3-71-gd317


From 00357f5ec5d67a52a175da6f29f85c2c19d59bc8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Dec 2017 15:06:50 +0100
Subject: sched/nohz: Clean up nohz enter/exit

The primary observation is that nohz enter/exit is always from the
current CPU, therefore NOHZ_TICK_STOPPED does not in fact need to be
an atomic.

Secondary is that we appear to have 2 nearly identical hooks in the
nohz enter code, set_cpu_sd_state_idle() and
nohz_balance_enter_idle(). Fold the whole set_cpu_sd_state thing into
nohz_balance_{enter,exit}_idle.

Removes an atomic op from both enter and exit paths.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/nohz.h |  2 --
 kernel/sched/core.c        |  2 +-
 kernel/sched/fair.c        | 73 +++++++++++++++++++++++-----------------------
 kernel/sched/sched.h       | 11 ++++---
 kernel/time/tick-sched.c   |  7 -----
 5 files changed, 43 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 094217273ff9..b36f4cf38111 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
-extern void set_cpu_sd_state_idle(void);
 extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
-static inline void set_cpu_sd_state_idle(void) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a10a2ce30a4..c7faeb7bd03a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5861,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	calc_load_migrate(rq);
 	update_max_interval();
-	nohz_balance_exit_idle(cpu);
+	nohz_balance_exit_idle(rq);
 	hrtick_clear(rq);
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 85232dad89c9..494d5db9a6cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9103,23 +9103,6 @@ static inline int find_new_ilb(void)
 	return nr_cpu_ids;
 }
 
-static inline void set_cpu_sd_state_busy(void)
-{
-	struct sched_domain *sd;
-	int cpu = smp_processor_id();
-
-	rcu_read_lock();
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
-	if (!sd || !sd->nohz_idle)
-		goto unlock;
-	sd->nohz_idle = 0;
-
-	atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
-	rcu_read_unlock();
-}
-
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
@@ -9175,8 +9158,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	 * We may be recently in ticked or tickless idle mode. At the first
 	 * busy tick after returning from idle, we will update the busy stats.
 	 */
-	set_cpu_sd_state_busy();
-	nohz_balance_exit_idle(cpu);
+	nohz_balance_exit_idle(rq);
 
 	/*
 	 * None are in tickless mode and hence no need for NOHZ idle load
@@ -9240,27 +9222,39 @@ out:
 		kick_ilb(flags);
 }
 
-void nohz_balance_exit_idle(unsigned int cpu)
+static void set_cpu_sd_state_busy(int cpu)
 {
-	unsigned int flags = atomic_read(nohz_flags(cpu));
+	struct sched_domain *sd;
 
-	if (unlikely(flags & NOHZ_TICK_STOPPED)) {
-		/*
-		 * Completely isolated CPUs don't ever set, so we must test.
-		 */
-		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
-			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-			atomic_dec(&nohz.nr_cpus);
-		}
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 
-		atomic_andnot(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-	}
+	if (!sd || !sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 0;
+
+	atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+	rcu_read_unlock();
 }
 
-void set_cpu_sd_state_idle(void)
+void nohz_balance_exit_idle(struct rq *rq)
+{
+	SCHED_WARN_ON(rq != this_rq());
+
+	if (likely(!rq->nohz_tick_stopped))
+		return;
+
+	rq->nohz_tick_stopped = 0;
+	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+	atomic_dec(&nohz.nr_cpus);
+
+	set_cpu_sd_state_busy(rq->cpu);
+}
+
+static void set_cpu_sd_state_idle(int cpu)
 {
 	struct sched_domain *sd;
-	int cpu = smp_processor_id();
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_llc, cpu));
@@ -9280,6 +9274,10 @@ unlock:
  */
 void nohz_balance_enter_idle(int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+
+	SCHED_WARN_ON(cpu != smp_processor_id());
+
 	/* If this CPU is going down, then nothing needs to be done: */
 	if (!cpu_active(cpu))
 		return;
@@ -9288,16 +9286,19 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
-	if (atomic_read(nohz_flags(cpu)) & NOHZ_TICK_STOPPED)
+	if (rq->nohz_tick_stopped)
 		return;
 
 	/* If we're a completely isolated CPU, we don't play: */
-	if (on_null_domain(cpu_rq(cpu)))
+	if (on_null_domain(rq))
 		return;
 
+	rq->nohz_tick_stopped = 1;
+
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
-	atomic_or(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+
+	set_cpu_sd_state_idle(cpu);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 21381d276709..818f22dbc7ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -764,6 +764,7 @@ struct rq {
 	unsigned long		last_load_update_tick;
 	unsigned long		last_blocked_load_update_tick;
 #endif /* CONFIG_SMP */
+	unsigned int		nohz_tick_stopped;
 	atomic_t nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -2035,11 +2036,9 @@ extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
 
 #ifdef CONFIG_NO_HZ_COMMON
-#define NOHZ_TICK_STOPPED_BIT	0
-#define NOHZ_BALANCE_KICK_BIT	1
-#define NOHZ_STATS_KICK_BIT	2
+#define NOHZ_BALANCE_KICK_BIT	0
+#define NOHZ_STATS_KICK_BIT	1
 
-#define NOHZ_TICK_STOPPED	BIT(NOHZ_TICK_STOPPED_BIT)
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
 
@@ -2047,9 +2046,9 @@ extern void cfs_bandwidth_usage_dec(void);
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
 
-extern void nohz_balance_exit_idle(unsigned int cpu);
+extern void nohz_balance_exit_idle(struct rq *rq);
 #else
-static inline void nohz_balance_exit_idle(unsigned int cpu) { }
+static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f2fa2e940fe5..ab92aa4442df 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -954,13 +954,6 @@ void tick_nohz_idle_enter(void)
 	struct tick_sched *ts;
 
 	lockdep_assert_irqs_enabled();
-	/*
-	 * Update the idle state in the scheduler domain hierarchy
-	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
-	 * State will be updated to busy during the first busy tick after
-	 * exiting idle.
-	 */
-	set_cpu_sd_state_idle();
 
 	local_irq_disable();
 
-- 
cgit v1.2.3-71-gd317


From ea14b57e8a181ac0561eba7a787e088f8c89f822 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 2 Feb 2018 10:27:00 +0100
Subject: sched/cpufreq: Provide migration hint

It was suggested that a migration hint might be usefull for the
CPU-freq governors.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/cpufreq.h |  1 +
 kernel/sched/fair.c           | 31 +++++++++++++++++++------------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index b48f2fb3b316..59667444669f 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -9,6 +9,7 @@
  */
 
 #define SCHED_CPUFREQ_IOWAIT	(1U << 0)
+#define SCHED_CPUFREQ_MIGRATION	(1U << 1)
 
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 494d5db9a6cd..e8f5efe2936c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -772,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			 * For !fair tasks do:
 			 *
 			update_cfs_rq_load_avg(now, cfs_rq);
-			attach_entity_load_avg(cfs_rq, se);
+			attach_entity_load_avg(cfs_rq, se, 0);
 			switched_from_fair(rq, p);
 			 *
 			 * such that the next switched_to_fair() has the
@@ -3009,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 {
 	struct rq *rq = rq_of(cfs_rq);
 
-	if (&rq->cfs == cfs_rq) {
+	if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
 		/*
 		 * There are a few boundary cases this might miss but it should
 		 * get called often enough that that should (hopefully) not be
@@ -3028,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 		 *
 		 * See cpu_util().
 		 */
-		cpufreq_update_util(rq, 0);
+		cpufreq_update_util(rq, flags);
 	}
 }
 
@@ -3686,7 +3686,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 #endif
 
 	if (decayed)
-		cfs_rq_util_change(cfs_rq);
+		cfs_rq_util_change(cfs_rq, 0);
 
 	return decayed;
 }
@@ -3699,7 +3699,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  * Must call update_cfs_rq_load_avg() before this, since we rely on
  * cfs_rq->avg.last_update_time being current.
  */
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
 
@@ -3735,7 +3735,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
-	cfs_rq_util_change(cfs_rq);
+	cfs_rq_util_change(cfs_rq, flags);
 }
 
 /**
@@ -3754,7 +3754,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
-	cfs_rq_util_change(cfs_rq);
+	cfs_rq_util_change(cfs_rq, 0);
 }
 
 /*
@@ -3784,7 +3784,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 
-		attach_entity_load_avg(cfs_rq, se);
+		/*
+		 * DO_ATTACH means we're here from enqueue_entity().
+		 * !last_update_time means we've passed through
+		 * migrate_task_rq_fair() indicating we migrated.
+		 *
+		 * IOW we're enqueueing a task on a new CPU.
+		 */
+		attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
 		update_tg_load_avg(cfs_rq, 0);
 
 	} else if (decayed && (flags & UPDATE_TG))
@@ -3880,13 +3887,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
-	cfs_rq_util_change(cfs_rq);
+	cfs_rq_util_change(cfs_rq, 0);
 }
 
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
 static inline void
 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 
@@ -9726,7 +9733,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 
 	/* Synchronize entity with its cfs_rq */
 	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
-	attach_entity_load_avg(cfs_rq, se);
+	attach_entity_load_avg(cfs_rq, se, 0);
 	update_tg_load_avg(cfs_rq, false);
 	propagate_entity_cfs_rq(se);
 }
-- 
cgit v1.2.3-71-gd317


From f643ea2207010db26f17fca99db031bad87c8461 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 13 Feb 2018 11:31:17 +0100
Subject: sched/nohz: Stop NOHZ stats when decayed

Stopped the periodic update of blocked load when all idle CPUs have fully
decayed. We introduce a new nohz.has_blocked that reflect if some idle
CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
is set everytime that a Idle CPU can have blocked load and it is then clear
when no more blocked load has been detected during an update. We don't need
atomic operation but only to make cure of the right ordering when updating
nohz.idle_cpus_mask and nohz.has_blocked.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: brendan.jackman@arm.com
Cc: dietmar.eggemann@arm.com
Cc: morten.rasmussen@foss.arm.com
Cc: valentin.schneider@arm.com
Link: http://lkml.kernel.org/r/1518517879-2280-2-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 116 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h |   1 +
 2 files changed, 97 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8f5efe2936c..78b06a0814d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5387,8 +5387,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
+	int has_blocked;		/* Idle CPUS has blocked load */
 	unsigned long next_balance;     /* in jiffy units */
-	unsigned long next_stats;
+	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
 } nohz ____cacheline_aligned;
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -7038,6 +7039,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
 #define LBF_NOHZ_STATS	0x10
+#define LBF_NOHZ_AGAIN	0x20
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7422,8 +7424,6 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7441,11 +7441,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
+	bool done = true;
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -7475,10 +7478,14 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
+		else
+			done = false;
 	}
 
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (done)
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7541,6 +7548,8 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
+	if (cfs_rq_is_decayed(cfs_rq))
+		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7876,18 +7885,25 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
 
+	if (!rq->has_blocked_load)
+		return false;
+
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-		return;
+		return false;
 
 	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
-		return;
+		return true;
 
 	update_blocked_averages(cpu);
+
+	return rq->has_blocked_load;
+#else
+	return false;
 #endif
 }
 
@@ -7913,8 +7929,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (env->flags & LBF_NOHZ_STATS)
-			update_nohz_stats(rq);
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward CPUs of our domain: */
 		if (local_group)
@@ -8072,12 +8088,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		prefer_sibling = 1;
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (env->idle == CPU_NEWLY_IDLE) {
+	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 		env->flags |= LBF_NOHZ_STATS;
-
-		if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
-			nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
-	}
 #endif
 
 	load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8133,6 +8145,15 @@ next_group:
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
+#ifdef CONFIG_NO_HZ_COMMON
+	if ((env->flags & LBF_NOHZ_AGAIN) &&
+	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+		WRITE_ONCE(nohz.next_blocked,
+			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+	}
+#endif
+
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
@@ -9174,7 +9195,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
 
-	if (time_after(now, nohz.next_stats))
+	if (READ_ONCE(nohz.has_blocked) &&
+	    time_after(now, READ_ONCE(nohz.next_blocked)))
 		flags = NOHZ_STATS_KICK;
 
 	if (time_before(now, nohz.next_balance))
@@ -9293,8 +9315,21 @@ void nohz_balance_enter_idle(int cpu)
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
+	/*
+	 * Can be set safely without rq->lock held
+	 * If a clear happens, it will have evaluated last additions because
+	 * rq->lock is held during the check and the clear
+	 */
+	rq->has_blocked_load = 1;
+
+	/*
+	 * The tick is still stopped but load could have been added in the
+	 * meantime. We set the nohz.has_blocked flag to trig a check of the
+	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+	 * of nohz.has_blocked can only happen after checking the new load
+	 */
 	if (rq->nohz_tick_stopped)
-		return;
+		goto out;
 
 	/* If we're a completely isolated CPU, we don't play: */
 	if (on_null_domain(rq))
@@ -9305,7 +9340,21 @@ void nohz_balance_enter_idle(int cpu)
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
 
+	/*
+	 * Ensures that if nohz_idle_balance() fails to observe our
+	 * @idle_cpus_mask store, it must observe the @has_blocked
+	 * store.
+	 */
+	smp_mb__after_atomic();
+
 	set_cpu_sd_state_idle(cpu);
+
+out:
+	/*
+	 * Each time a cpu enter idle, we assume that it has blocked load and
+	 * enable the periodic update of the load of idle cpus
+	 */
+	WRITE_ONCE(nohz.has_blocked, 1);
 }
 #else
 static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9439,7 +9488,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
 	unsigned long next_balance = now + 60*HZ;
-	unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
 	unsigned int flags;
@@ -9458,6 +9507,22 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
+	/*
+	 * We assume there will be no idle load after this update and clear
+	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
+	 * set the has_blocked flag and trig another update of idle load.
+	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
+	 * setting the flag, we are sure to not clear the state and not
+	 * check the load of an idle cpu.
+	 */
+	WRITE_ONCE(nohz.has_blocked, 0);
+
+	/*
+	 * Ensures that if we miss the CPU, we must see the has_blocked
+	 * store from nohz_balance_enter_idle().
+	 */
+	smp_mb();
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;
@@ -9467,11 +9532,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		 * work being done for other CPUs. Next load
 		 * balancing owner will pick it up.
 		 */
-		if (need_resched())
-			break;
+		if (need_resched()) {
+			has_blocked_load = true;
+			goto abort;
+		}
 
 		rq = cpu_rq(balance_cpu);
 
+		update_blocked_averages(rq->cpu);
+		has_blocked_load |= rq->has_blocked_load;
+
 		/*
 		 * If time for next balance is due,
 		 * do the balance.
@@ -9484,7 +9554,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 			cpu_load_update_idle(rq);
 			rq_unlock_irq(rq, &rf);
 
-			update_blocked_averages(rq->cpu);
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
 		}
@@ -9499,7 +9568,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
-	nohz.next_stats = next_stats;
+	WRITE_ONCE(nohz.next_blocked,
+		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+	/* There is still blocked load, enable periodic update */
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);
 
 	/*
 	 * next_balance will be updated only when there is a need.
@@ -10135,6 +10210,7 @@ __init void init_sched_fair_class(void)
 
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
+	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
 #endif /* SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 818f22dbc7ea..22909ffc04fb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -763,6 +763,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long		last_load_update_tick;
 	unsigned long		last_blocked_load_update_tick;
+	unsigned int		has_blocked_load;
 #endif /* CONFIG_SMP */
 	unsigned int		nohz_tick_stopped;
 	atomic_t nohz_flags;
-- 
cgit v1.2.3-71-gd317


From 1936c53ce8c8d4555e9ccad2dc8d98e0637b11f7 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 13 Feb 2018 11:31:18 +0100
Subject: sched/fair: Reduce the periodic update duration

Instead of using the cfs_rq_is_decayed() which monitors all *_avg
and *_sum, we create a cfs_rq_has_blocked() which only takes care of
util_avg and load_avg. We are only interested by these 2 values which are
decaying faster than the *_sum so we can stop the periodic update earlier.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: brendan.jackman@arm.com
Cc: dietmar.eggemann@arm.com
Cc: morten.rasmussen@foss.arm.com
Cc: valentin.schneider@arm.com
Link: http://lkml.kernel.org/r/1518517879-2280-3-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 78b06a0814d1..aad7c03dbad8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7424,6 +7424,19 @@ static void attach_tasks(struct lb_env *env)
 	rq_unlock(env->dst_rq, &rf);
 }
 
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->avg.load_avg)
+		return true;
+
+	if (cfs_rq->avg.util_avg)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load.weight)
@@ -7441,8 +7454,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	return true;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7478,7 +7489,9 @@ static void update_blocked_averages(int cpu)
 		 */
 		if (cfs_rq_is_decayed(cfs_rq))
 			list_del_leaf_cfs_rq(cfs_rq);
-		else
+
+		/* Don't need periodic decay once load/util_avg are null */
+		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
 	}
 
@@ -7548,7 +7561,7 @@ static inline void update_blocked_averages(int cpu)
 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
-	if (cfs_rq_is_decayed(cfs_rq))
+	if (!cfs_rq_has_blocked(cfs_rq))
 		rq->has_blocked_load = 0;
 #endif
 	rq_unlock_irqrestore(rq, &rf);
-- 
cgit v1.2.3-71-gd317


From 63928384faefba1b31c3bb77361965715a9fc71c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 13 Feb 2018 16:54:17 +0100
Subject: sched/nohz: Optimize nohz_idle_balance()

Avoid calling update_blocked_averages() when it does not in fact have
any by re-using/extending update_nohz_stats().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aad7c03dbad8..5c357561db5d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7898,7 +7898,7 @@ group_type group_classify(struct sched_group *group,
 	return group_other;
 }
 
-static bool update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq, bool force)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
@@ -7909,7 +7909,7 @@ static bool update_nohz_stats(struct rq *rq)
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 		return false;
 
-	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
 		return true;
 
 	update_blocked_averages(cpu);
@@ -7942,7 +7942,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward CPUs of our domain: */
@@ -9552,8 +9552,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 		rq = cpu_rq(balance_cpu);
 
-		update_blocked_averages(rq->cpu);
-		has_blocked_load |= rq->has_blocked_load;
+		has_blocked_load |= update_nohz_stats(rq, true);
 
 		/*
 		 * If time for next balance is due,
-- 
cgit v1.2.3-71-gd317


From af3fe03c562055bc3c116eabe73f141ae31bf234 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 20 Feb 2018 10:58:39 +0100
Subject: sched/fair: Move rebalance_domains()

This pure code movement results in two #ifdef CONFIG_NO_HZ_COMMON
sections landing next to each other.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 236 ++++++++++++++++++++++++++--------------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c357561db5d..0da79d8a6a2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9121,6 +9121,124 @@ out_unlock:
 	return 0;
 }
 
+static DEFINE_SPINLOCK(balancing);
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+	max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in init_sched_domains.
+ */
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+{
+	int continue_balancing = 1;
+	int cpu = rq->cpu;
+	unsigned long interval;
+	struct sched_domain *sd;
+	/* Earliest time when we have to do rebalance again */
+	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
+	int need_serialize, need_decay = 0;
+	u64 max_cost = 0;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		/*
+		 * Decay the newidle max times here because this is a regular
+		 * visit to all the domains. Decay ~1% per second.
+		 */
+		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+			sd->max_newidle_lb_cost =
+				(sd->max_newidle_lb_cost * 253) / 256;
+			sd->next_decay_max_lb_cost = jiffies + HZ;
+			need_decay = 1;
+		}
+		max_cost += sd->max_newidle_lb_cost;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		/*
+		 * Stop the load balance at this level. There is another
+		 * CPU in our sched group which is doing load balancing more
+		 * actively.
+		 */
+		if (!continue_balancing) {
+			if (need_decay)
+				continue;
+			break;
+		}
+
+		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+
+		need_serialize = sd->flags & SD_SERIALIZE;
+		if (need_serialize) {
+			if (!spin_trylock(&balancing))
+				goto out;
+		}
+
+		if (time_after_eq(jiffies, sd->last_balance + interval)) {
+			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+				/*
+				 * The LBF_DST_PINNED logic could have changed
+				 * env->dst_cpu, so we can't know our idle
+				 * state even if we migrated tasks. Update it.
+				 */
+				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+			}
+			sd->last_balance = jiffies;
+			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+		}
+		if (need_serialize)
+			spin_unlock(&balancing);
+out:
+		if (time_after(next_balance, sd->last_balance + interval)) {
+			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
+	}
+	if (need_decay) {
+		/*
+		 * Ensure the rq-wide value also decays but keep it at a
+		 * reasonable floor to avoid funnies with rq->avg_idle.
+		 */
+		rq->max_idle_balance_cost =
+			max((u64)sysctl_sched_migration_cost, max_cost);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to null domain for ex, it will not be
+	 * updated.
+	 */
+	if (likely(update_next_balance)) {
+		rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+		/*
+		 * If this CPU has been elected to perform the nohz idle
+		 * balance. Other idle CPUs have already rebalanced with
+		 * nohz_idle_balance() and nohz.next_balance has been
+		 * updated accordingly. This CPU is now running the idle load
+		 * balance for itself and we need to update the
+		 * nohz.next_balance accordingly.
+		 */
+		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+			nohz.next_balance = rq->next_balance;
+#endif
+	}
+}
+
 static inline int on_null_domain(struct rq *rq)
 {
 	return unlikely(!rcu_dereference_sched(rq->sd));
@@ -9373,124 +9491,6 @@ out:
 static inline void nohz_balancer_kick(struct rq *rq) { }
 #endif
 
-static DEFINE_SPINLOCK(balancing);
-
-/*
- * Scale the max load_balance interval with the number of CPUs in the system.
- * This trades load-balance latency on larger machines for less cross talk.
- */
-void update_max_interval(void)
-{
-	max_load_balance_interval = HZ*num_online_cpus()/10;
-}
-
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in init_sched_domains.
- */
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
-{
-	int continue_balancing = 1;
-	int cpu = rq->cpu;
-	unsigned long interval;
-	struct sched_domain *sd;
-	/* Earliest time when we have to do rebalance again */
-	unsigned long next_balance = jiffies + 60*HZ;
-	int update_next_balance = 0;
-	int need_serialize, need_decay = 0;
-	u64 max_cost = 0;
-
-	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		/*
-		 * Decay the newidle max times here because this is a regular
-		 * visit to all the domains. Decay ~1% per second.
-		 */
-		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
-			sd->max_newidle_lb_cost =
-				(sd->max_newidle_lb_cost * 253) / 256;
-			sd->next_decay_max_lb_cost = jiffies + HZ;
-			need_decay = 1;
-		}
-		max_cost += sd->max_newidle_lb_cost;
-
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			continue;
-
-		/*
-		 * Stop the load balance at this level. There is another
-		 * CPU in our sched group which is doing load balancing more
-		 * actively.
-		 */
-		if (!continue_balancing) {
-			if (need_decay)
-				continue;
-			break;
-		}
-
-		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
-
-		need_serialize = sd->flags & SD_SERIALIZE;
-		if (need_serialize) {
-			if (!spin_trylock(&balancing))
-				goto out;
-		}
-
-		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
-				/*
-				 * The LBF_DST_PINNED logic could have changed
-				 * env->dst_cpu, so we can't know our idle
-				 * state even if we migrated tasks. Update it.
-				 */
-				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
-			}
-			sd->last_balance = jiffies;
-			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
-		}
-		if (need_serialize)
-			spin_unlock(&balancing);
-out:
-		if (time_after(next_balance, sd->last_balance + interval)) {
-			next_balance = sd->last_balance + interval;
-			update_next_balance = 1;
-		}
-	}
-	if (need_decay) {
-		/*
-		 * Ensure the rq-wide value also decays but keep it at a
-		 * reasonable floor to avoid funnies with rq->avg_idle.
-		 */
-		rq->max_idle_balance_cost =
-			max((u64)sysctl_sched_migration_cost, max_cost);
-	}
-	rcu_read_unlock();
-
-	/*
-	 * next_balance will be updated only when there is a need.
-	 * When the CPU is attached to null domain for ex, it will not be
-	 * updated.
-	 */
-	if (likely(update_next_balance)) {
-		rq->next_balance = next_balance;
-
-#ifdef CONFIG_NO_HZ_COMMON
-		/*
-		 * If this CPU has been elected to perform the nohz idle
-		 * balance. Other idle CPUs have already rebalanced with
-		 * nohz_idle_balance() and nohz.next_balance has been
-		 * updated accordingly. This CPU is now running the idle load
-		 * balance for itself and we need to update the
-		 * nohz.next_balance accordingly.
-		 */
-		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
-			nohz.next_balance = rq->next_balance;
-#endif
-	}
-}
-
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
-- 
cgit v1.2.3-71-gd317


From dd707247ababb685ac4b8b2c6a7bf2923725e6ac Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 20 Feb 2018 10:59:45 +0100
Subject: sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks

Now that we have two back-to-back NO_HZ_COMMON blocks, merge them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0da79d8a6a2c..d6767f533029 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9487,11 +9487,7 @@ out:
 	 */
 	WRITE_ONCE(nohz.has_blocked, 1);
 }
-#else
-static inline void nohz_balancer_kick(struct rq *rq) { }
-#endif
 
-#ifdef CONFIG_NO_HZ_COMMON
 /*
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the CPUs for whom scheduler ticks are stopped.
@@ -9598,12 +9594,14 @@ abort:
 
 	return true;
 }
-#else
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void nohz_balancer_kick(struct rq *rq) { }
+
 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
 	return false;
 }
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
-- 
cgit v1.2.3-71-gd317


From 47ea54121e46a685aa2320df8b0f71aaeedff23f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 20 Feb 2018 11:45:47 +0100
Subject: sched/fair: Move idle_balance()

We're going to want to call nohz_idle_balance() or parts thereof from
idle_balance(). Since we already have a forward declaration of
idle_balance() move it down such that it's below nohz_idle_balance()
avoiding the need for a forward declaration for that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 228 ++++++++++++++++++++++++++--------------------------
 1 file changed, 114 insertions(+), 114 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6767f533029..058badcfa94b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8916,120 +8916,6 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 		*next_balance = next;
 }
 
-/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
-	unsigned long next_balance = jiffies + HZ;
-	int this_cpu = this_rq->cpu;
-	struct sched_domain *sd;
-	int pulled_task = 0;
-	u64 curr_cost = 0;
-
-	/*
-	 * We must set idle_stamp _before_ calling idle_balance(), such that we
-	 * measure the duration of idle_balance() as idle time.
-	 */
-	this_rq->idle_stamp = rq_clock(this_rq);
-
-	/*
-	 * Do not pull tasks towards !active CPUs...
-	 */
-	if (!cpu_active(this_cpu))
-		return 0;
-
-	/*
-	 * This is OK, because current is on_cpu, which avoids it being picked
-	 * for load-balance and preemption/IRQs are still disabled avoiding
-	 * further scheduler activity on it and we're being very careful to
-	 * re-start the picking loop.
-	 */
-	rq_unpin_lock(this_rq, rf);
-
-	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
-		rcu_read_lock();
-		sd = rcu_dereference_check_sched_domain(this_rq->sd);
-		if (sd)
-			update_next_balance(sd, &next_balance);
-		rcu_read_unlock();
-
-		goto out;
-	}
-
-	raw_spin_unlock(&this_rq->lock);
-
-	update_blocked_averages(this_cpu);
-	rcu_read_lock();
-	for_each_domain(this_cpu, sd) {
-		int continue_balancing = 1;
-		u64 t0, domain_cost;
-
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			continue;
-
-		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
-			update_next_balance(sd, &next_balance);
-			break;
-		}
-
-		if (sd->flags & SD_BALANCE_NEWIDLE) {
-			t0 = sched_clock_cpu(this_cpu);
-
-			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE,
-						   &continue_balancing);
-
-			domain_cost = sched_clock_cpu(this_cpu) - t0;
-			if (domain_cost > sd->max_newidle_lb_cost)
-				sd->max_newidle_lb_cost = domain_cost;
-
-			curr_cost += domain_cost;
-		}
-
-		update_next_balance(sd, &next_balance);
-
-		/*
-		 * Stop searching for tasks to pull if there are
-		 * now runnable tasks on this rq.
-		 */
-		if (pulled_task || this_rq->nr_running > 0)
-			break;
-	}
-	rcu_read_unlock();
-
-	raw_spin_lock(&this_rq->lock);
-
-	if (curr_cost > this_rq->max_idle_balance_cost)
-		this_rq->max_idle_balance_cost = curr_cost;
-
-	/*
-	 * While browsing the domains, we released the rq lock, a task could
-	 * have been enqueued in the meantime. Since we're not going idle,
-	 * pretend we pulled a task.
-	 */
-	if (this_rq->cfs.h_nr_running && !pulled_task)
-		pulled_task = 1;
-
-out:
-	/* Move the next balance forward */
-	if (time_after(this_rq->next_balance, next_balance))
-		this_rq->next_balance = next_balance;
-
-	/* Is there a task of a high priority class? */
-	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
-		pulled_task = -1;
-
-	if (pulled_task)
-		this_rq->idle_stamp = 0;
-
-	rq_repin_lock(this_rq, rf);
-
-	return pulled_task;
-}
-
 /*
  * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
  * running tasks off the busiest CPU onto idle CPUs. It requires at
@@ -9603,6 +9489,120 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 }
 #endif /* CONFIG_NO_HZ_COMMON */
 
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+	unsigned long next_balance = jiffies + HZ;
+	int this_cpu = this_rq->cpu;
+	struct sched_domain *sd;
+	int pulled_task = 0;
+	u64 curr_cost = 0;
+
+	/*
+	 * We must set idle_stamp _before_ calling idle_balance(), such that we
+	 * measure the duration of idle_balance() as idle time.
+	 */
+	this_rq->idle_stamp = rq_clock(this_rq);
+
+	/*
+	 * Do not pull tasks towards !active CPUs...
+	 */
+	if (!cpu_active(this_cpu))
+		return 0;
+
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	rq_unpin_lock(this_rq, rf);
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
+		rcu_read_lock();
+		sd = rcu_dereference_check_sched_domain(this_rq->sd);
+		if (sd)
+			update_next_balance(sd, &next_balance);
+		rcu_read_unlock();
+
+		goto out;
+	}
+
+	raw_spin_unlock(&this_rq->lock);
+
+	update_blocked_averages(this_cpu);
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		int continue_balancing = 1;
+		u64 t0, domain_cost;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+			update_next_balance(sd, &next_balance);
+			break;
+		}
+
+		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			t0 = sched_clock_cpu(this_cpu);
+
+			pulled_task = load_balance(this_cpu, this_rq,
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
+
+			domain_cost = sched_clock_cpu(this_cpu) - t0;
+			if (domain_cost > sd->max_newidle_lb_cost)
+				sd->max_newidle_lb_cost = domain_cost;
+
+			curr_cost += domain_cost;
+		}
+
+		update_next_balance(sd, &next_balance);
+
+		/*
+		 * Stop searching for tasks to pull if there are
+		 * now runnable tasks on this rq.
+		 */
+		if (pulled_task || this_rq->nr_running > 0)
+			break;
+	}
+	rcu_read_unlock();
+
+	raw_spin_lock(&this_rq->lock);
+
+	if (curr_cost > this_rq->max_idle_balance_cost)
+		this_rq->max_idle_balance_cost = curr_cost;
+
+	/*
+	 * While browsing the domains, we released the rq lock, a task could
+	 * have been enqueued in the meantime. Since we're not going idle,
+	 * pretend we pulled a task.
+	 */
+	if (this_rq->cfs.h_nr_running && !pulled_task)
+		pulled_task = 1;
+
+out:
+	/* Move the next balance forward */
+	if (time_after(this_rq->next_balance, next_balance))
+		this_rq->next_balance = next_balance;
+
+	/* Is there a task of a high priority class? */
+	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+		pulled_task = -1;
+
+	if (pulled_task)
+		this_rq->idle_stamp = 0;
+
+	rq_repin_lock(this_rq, rf);
+
+	return pulled_task;
+}
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
-- 
cgit v1.2.3-71-gd317


From 31e77c93e432dec79c7d90b888bbfc3652592741 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 14 Feb 2018 16:26:46 +0100
Subject: sched/fair: Update blocked load when newly idle

When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: brendan.jackman@arm.com
Cc: dietmar.eggemann@arm.com
Cc: morten.rasmussen@foss.arm.com
Cc: valentin.schneider@arm.com
Link: http://lkml.kernel.org/r/1518622006-16089-4-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 105 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 058badcfa94b..3582117e1580 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9375,10 +9375,14 @@ out:
 }
 
 /*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the CPUs for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
  */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+			       enum cpu_idle_type idle)
 {
 	/* Earliest time when we have to do rebalance again */
 	unsigned long now = jiffies;
@@ -9386,20 +9390,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 	bool has_blocked_load = false;
 	int update_next_balance = 0;
 	int this_cpu = this_rq->cpu;
-	unsigned int flags;
 	int balance_cpu;
+	int ret = false;
 	struct rq *rq;
 
-	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle != CPU_IDLE) {
-		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-		return false;
-	}
-
-	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
-
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
 
 	/*
@@ -9443,10 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			struct rq_flags rf;
 
-			rq_lock_irq(rq, &rf);
+			rq_lock_irqsave(rq, &rf);
 			update_rq_clock(rq);
 			cpu_load_update_idle(rq);
-			rq_unlock_irq(rq, &rf);
+			rq_unlock_irqrestore(rq, &rf);
 
 			if (flags & NOHZ_BALANCE_KICK)
 				rebalance_domains(rq, CPU_IDLE);
@@ -9458,13 +9452,21 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		}
 	}
 
-	update_blocked_averages(this_cpu);
+	/* Newly idle CPU doesn't need an update */
+	if (idle != CPU_NEWLY_IDLE) {
+		update_blocked_averages(this_cpu);
+		has_blocked_load |= this_rq->has_blocked_load;
+	}
+
 	if (flags & NOHZ_BALANCE_KICK)
 		rebalance_domains(this_rq, CPU_IDLE);
 
 	WRITE_ONCE(nohz.next_blocked,
 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
+	/* The full idle balance loop has been done */
+	ret = true;
+
 abort:
 	/* There is still blocked load, enable periodic update */
 	if (has_blocked_load)
@@ -9478,15 +9480,79 @@ abort:
 	if (likely(update_next_balance))
 		nohz.next_balance = next_balance;
 
+	return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+	int this_cpu = this_rq->cpu;
+	unsigned int flags;
+
+	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+		return false;
+
+	if (idle != CPU_IDLE) {
+		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+		return false;
+	}
+
+	/*
+	 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+	 */
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+	if (!(flags & NOHZ_KICK_MASK))
+		return false;
+
+	_nohz_idle_balance(this_rq, flags, idle);
+
 	return true;
 }
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+	int this_cpu = this_rq->cpu;
+
+	/*
+	 * This CPU doesn't want to be disturbed by scheduler
+	 * housekeeping
+	 */
+	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+		return;
+
+	/* Will wake up very soon. No time for doing anything else*/
+	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+		return;
+
+	/* Don't need to update blocked load of idle CPUs*/
+	if (!READ_ONCE(nohz.has_blocked) ||
+	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+		return;
+
+	raw_spin_unlock(&this_rq->lock);
+	/*
+	 * This CPU is going to be idle and blocked load of idle CPUs
+	 * need to be updated. Run the ilb locally as it is a good
+	 * candidate for ilb instead of waking up another idle CPU.
+	 * Kick an normal ilb if we failed to do the update.
+	 */
+	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+		kick_ilb(NOHZ_STATS_KICK);
+	raw_spin_lock(&this_rq->lock);
+}
+
 #else /* !CONFIG_NO_HZ_COMMON */
 static inline void nohz_balancer_kick(struct rq *rq) { }
 
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
 	return false;
 }
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /*
@@ -9523,12 +9589,15 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !this_rq->rd->overload) {
+
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
 			update_next_balance(sd, &next_balance);
 		rcu_read_unlock();
 
+		nohz_newidle_balance(this_rq);
+
 		goto out;
 	}
 
-- 
cgit v1.2.3-71-gd317


From d17067e4487adc53bedb43681b3cb5a1714ff6ca Mon Sep 17 00:00:00 2001
From: gaurav jindal <gauravjindal1104@gmail.com>
Date: Wed, 21 Feb 2018 18:24:07 +0530
Subject: sched/completions: Use bool in try_wait_for_completion()

Since the return type of the function is bool, the internal
'ret' variable should be bool too.

Signed-off-by: Gaurav Jindal<gauravjindal1104@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180221125407.GA14292@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/completion.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 5d2d56b0817a..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
 bool try_wait_for_completion(struct completion *x)
 {
 	unsigned long flags;
-	int ret = 1;
+	bool ret = true;
 
 	/*
 	 * Since x->done will need to be locked only
@@ -289,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
 	 * return early in the blocking case.
 	 */
 	if (!READ_ONCE(x->done))
-		return 0;
+		return false;
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 	if (!x->done)
-		ret = 0;
+		ret = false;
 	else if (x->done != UINT_MAX)
 		x->done--;
 	spin_unlock_irqrestore(&x->wait.lock, flags);
-- 
cgit v1.2.3-71-gd317


From bd903afeb504db5655a45bb4cf86f38be5b1bf62 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 5 Mar 2018 21:55:04 -0800
Subject: perf/core: Fix ctx_event_type in ctx_resched()

In ctx_resched(), EVENT_FLEXIBLE should be sched_out when EVENT_PINNED is
added. However, ctx_resched() calculates ctx_event_type before checking
this condition. As a result, pinned events will NOT get higher priority
than flexible events.

The following shows this issue on an Intel CPU (where ref-cycles can
only use one hardware counter).

  1. First start:
       perf stat -C 0 -e ref-cycles  -I 1000
  2. Then, in the second console, run:
       perf stat -C 0 -e ref-cycles:D -I 1000

The second perf uses pinned events, which is expected to have higher
priority. However, because it failed in ctx_resched(). It is never
run.

This patch fixes this by calculating ctx_event_type after re-evaluating
event_type.

Reported-by: Ephraim Park <ephiepark@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <jolsa@redhat.com>
Cc: <kernel-team@fb.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 487f05e18aa4 ("perf/core: Optimize event rescheduling on active contexts")
Link: http://lkml.kernel.org/r/20180306055504.3283731-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 96db9ae5d5af..4b838470fac4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2246,7 +2246,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 			struct perf_event_context *task_ctx,
 			enum event_type_t event_type)
 {
-	enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+	enum event_type_t ctx_event_type;
 	bool cpu_event = !!(event_type & EVENT_CPU);
 
 	/*
@@ -2256,6 +2256,8 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	if (event_type & EVENT_PINNED)
 		event_type |= EVENT_FLEXIBLE;
 
+	ctx_event_type = event_type & EVENT_ALL;
+
 	perf_pmu_disable(cpuctx->ctx.pmu);
 	if (task_ctx)
 		task_ctx_sched_out(cpuctx, task_ctx, event_type);
-- 
cgit v1.2.3-71-gd317


From 6b0ef92fee2a3189eba6d6b827b247cb4f6da7e9 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Fri, 9 Mar 2018 14:56:28 +0800
Subject: rtmutex: Make rt_mutex_futex_unlock() safe for irq-off callsites

When running rcutorture with TREE03 config, CONFIG_PROVE_LOCKING=y, and
kernel cmdline argument "rcutorture.gp_exp=1", lockdep reports a
HARDIRQ-safe->HARDIRQ-unsafe deadlock:

 ================================
 WARNING: inconsistent lock state
 4.16.0-rc4+ #1 Not tainted
 --------------------------------
 inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
 takes:
 __schedule+0xbe/0xaf0
 {IN-HARDIRQ-W} state was registered at:
   _raw_spin_lock+0x2a/0x40
   scheduler_tick+0x47/0xf0
...
 other info that might help us debug this:
  Possible unsafe locking scenario:
        CPU0
        ----
   lock(&rq->lock);
   <Interrupt>
     lock(&rq->lock);
  *** DEADLOCK ***
 1 lock held by rcu_torture_rea/724:
 rcu_torture_read_lock+0x0/0x70
 stack backtrace:
 CPU: 2 PID: 724 Comm: rcu_torture_rea Not tainted 4.16.0-rc4+ #1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
 Call Trace:
  lock_acquire+0x90/0x200
  ? __schedule+0xbe/0xaf0
  _raw_spin_lock+0x2a/0x40
  ? __schedule+0xbe/0xaf0
  __schedule+0xbe/0xaf0
  preempt_schedule_irq+0x2f/0x60
  retint_kernel+0x1b/0x2d
 RIP: 0010:rcu_read_unlock_special+0x0/0x680
  ? rcu_torture_read_unlock+0x60/0x60
  __rcu_read_unlock+0x64/0x70
  rcu_torture_read_unlock+0x17/0x60
  rcu_torture_reader+0x275/0x450
  ? rcutorture_booster_init+0x110/0x110
  ? rcu_torture_stall+0x230/0x230
  ? kthread+0x10e/0x130
  kthread+0x10e/0x130
  ? kthread_create_worker_on_cpu+0x70/0x70
  ? call_usermodehelper_exec_async+0x11a/0x150
  ret_from_fork+0x3a/0x50

This happens with the following even sequence:

	preempt_schedule_irq();
	  local_irq_enable();
	  __schedule():
	    local_irq_disable(); // irq off
	    ...
	    rcu_note_context_switch():
	      rcu_note_preempt_context_switch():
	        rcu_read_unlock_special():
	          local_irq_save(flags);
	          ...
		  raw_spin_unlock_irqrestore(...,flags); // irq remains off
	          rt_mutex_futex_unlock():
	            raw_spin_lock_irq();
	            ...
	            raw_spin_unlock_irq(); // accidentally set irq on

	    <return to __schedule()>
	    rq_lock():
	      raw_spin_lock(); // acquiring rq->lock with irq on

which means rq->lock becomes a HARDIRQ-unsafe lock, which can cause
deadlocks in scheduler code.

This problem was introduced by commit 02a7c234e540 ("rcu: Suppress
lockdep false-positive ->boost_mtx complaints"). That brought the user
of rt_mutex_futex_unlock() with irq off.

To fix this, replace the *lock_irq() in rt_mutex_futex_unlock() with
*lock_irq{save,restore}() to make it safe to call rt_mutex_futex_unlock()
with irq off.

Fixes: 02a7c234e540 ("rcu: Suppress lockdep false-positive ->boost_mtx complaints")
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20180309065630.8283-1-boqun.feng@gmail.com
---
 kernel/locking/rtmutex.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 65cc0cb984e6..940633c63254 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1616,11 +1616,12 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
 {
 	DEFINE_WAKE_Q(wake_q);
+	unsigned long flags;
 	bool postunlock;
 
-	raw_spin_lock_irq(&lock->wait_lock);
+	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
-	raw_spin_unlock_irq(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	if (postunlock)
 		rt_mutex_postunlock(&wake_q);
-- 
cgit v1.2.3-71-gd317


From 0862ca422b79cb5aa70823ee0f07f6b468f86070 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 9 Mar 2018 15:50:59 -0800
Subject: bug: use %pB in BUG and stack protector failure

The BUG and stack protector reports were still using a raw %p.  This
changes it to %pB for more meaningful output.

Link: http://lkml.kernel.org/r/20180301225704.GA34198@beast
Fixes: ad67b74d2469 ("printk: hash addresses printed with %p")
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Richard Weinberger <richard.weinberger@gmail.com>,
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 2 +-
 lib/bug.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 2cfef408fec9..4b794f1d8561 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -640,7 +640,7 @@ device_initcall(register_warn_debugfs);
  */
 __visible void __stack_chk_fail(void)
 {
-	panic("stack-protector: Kernel stack is corrupted in: %p\n",
+	panic("stack-protector: Kernel stack is corrupted in: %pB\n",
 		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
diff --git a/lib/bug.c b/lib/bug.c
index c1b0fad31b10..44f432cb064d 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -191,7 +191,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 	if (file)
 		pr_crit("kernel BUG at %s:%u!\n", file, line);
 	else
-		pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n",
+		pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
 			(void *)bugaddr);
 
 	return BUG_TRAP_TYPE_BUG;
-- 
cgit v1.2.3-71-gd317


From 9e5b127d6f33468143d90c8a45ca12410e4c3fa7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Mar 2018 12:52:04 +0100
Subject: perf/core: Fix perf_output_read_group()

Mark reported his arm64 perf fuzzer runs sometimes splat like:

  armv8pmu_read_counter+0x1e8/0x2d8
  armpmu_event_update+0x8c/0x188
  armpmu_read+0xc/0x18
  perf_output_read+0x550/0x11e8
  perf_event_read_event+0x1d0/0x248
  perf_event_exit_task+0x468/0xbb8
  do_exit+0x690/0x1310
  do_group_exit+0xd0/0x2b0
  get_signal+0x2e8/0x17a8
  do_signal+0x144/0x4f8
  do_notify_resume+0x148/0x1e8
  work_pending+0x8/0x14

which asserts that we only call pmu::read() on ACTIVE events.

The above callchain does:

  perf_event_exit_task()
    perf_event_exit_task_context()
      task_ctx_sched_out() // INACTIVE
      perf_event_exit_event()
        perf_event_set_state(EXIT) // EXIT
        sync_child_event()
          perf_event_read_event()
            perf_output_read()
              perf_output_read_group()
                leader->pmu->read()

Which results in doing a pmu::read() on an !ACTIVE event.

I _think_ this is 'new' since we added attr.inherit_stat, which added
the perf_event_read_event() to the exit path, without that
perf_event_read_output() would only trigger from samples and for
@event to trigger a sample, it's leader _must_ be ACTIVE too.

Still, adding this check makes it consistent with the @sub case for
the siblings.

Reported-and-Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57898102847f..8b6a2774e084 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5730,7 +5730,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = running;
 
-	if (leader != event)
+	if ((leader != event) &&
+	    (leader->state == PERF_EVENT_STATE_ACTIVE))
 		leader->pmu->read(leader);
 
 	values[n++] = perf_event_count(leader);
-- 
cgit v1.2.3-71-gd317


From 8e1a2031e4b556b01ca53cd1fb2d83d811a6605b Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Fri, 8 Sep 2017 11:47:03 +0300
Subject: perf/cor: Use RB trees for pinned/flexible groups

Change event groups into RB trees sorted by CPU and then by a 64bit
index, so that multiplexing hrtimer interrupt handler would be able
skipping to the current CPU's list and ignore groups allocated for the
other CPUs.

New API for manipulating event groups in the trees is implemented as well
as adoption on the API in the current implementation.

pinned_group_sched_in() and flexible_group_sched_in() API are
introduced to consolidate code enabling the whole group from pinned
and flexible groups appropriately.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/372f9c8b-0cfe-4240-e44d-83d863d40813@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  16 ++-
 kernel/events/core.c       | 307 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 267 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7546822a1d74..6e3f854a34d8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -558,7 +558,11 @@ struct perf_event {
 	 */
 	struct list_head		group_entry;
 	struct list_head		sibling_list;
-
+	/*
+	 * Node on the pinned or flexible tree located at the event context;
+	 */
+	struct rb_node			group_node;
+	u64				group_index;
 	/*
 	 * We need storage to track the entries in perf_pmu_migrate_context; we
 	 * cannot use the event_entry because of RCU and we want to keep the
@@ -690,6 +694,12 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+
+struct perf_event_groups {
+	struct rb_root	tree;
+	u64		index;
+};
+
 /**
  * struct perf_event_context - event context structure
  *
@@ -710,8 +720,8 @@ struct perf_event_context {
 	struct mutex			mutex;
 
 	struct list_head		active_ctx_list;
-	struct list_head		pinned_groups;
-	struct list_head		flexible_groups;
+	struct perf_event_groups	pinned_groups;
+	struct perf_event_groups	flexible_groups;
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8b6a2774e084..c9fee3640f40 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1460,8 +1460,21 @@ static enum event_type_t get_event_type(struct perf_event *event)
 	return event_type;
 }
 
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Helper function to initialize group leader event;
+ */
+void init_event_group(struct perf_event *event)
+{
+	RB_CLEAR_NODE(&event->group_node);
+	event->group_index = 0;
+}
+
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits;
+ */
+static struct perf_event_groups *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
 	if (event->attr.pinned)
 		return &ctx->pinned_groups;
@@ -1469,6 +1482,169 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 		return &ctx->flexible_groups;
 }
 
+/*
+ * Helper function to initializes perf event groups object;
+ */
+void perf_event_groups_init(struct perf_event_groups *groups)
+{
+	groups->tree = RB_ROOT;
+	groups->index = 0;
+}
+
+/*
+ * Compare function for event groups;
+ * Implements complex key that first sorts by CPU and then by
+ * virtual index which provides ordering when rotating
+ * groups for the same CPU;
+ */
+int perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+{
+	if (left->cpu < right->cpu) {
+		return 1;
+	} else if (left->cpu > right->cpu) {
+		return 0;
+	} else {
+		if (left->group_index < right->group_index) {
+			return 1;
+		} else if(left->group_index > right->group_index) {
+			return 0;
+		} else {
+			return 0;
+		}
+	}
+}
+
+/*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+		struct perf_event *event)
+{
+	struct perf_event *node_event;
+	struct rb_node *parent;
+	struct rb_node **node;
+
+	event->group_index = ++groups->index;
+
+	node = &groups->tree.rb_node;
+	parent = *node;
+
+	while (*node) {
+		parent = *node;
+		node_event = container_of(*node,
+				struct perf_event, group_node);
+
+		if (perf_event_groups_less(event, node_event))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&event->group_node, parent, node);
+	rb_insert_color(&event->group_node, &groups->tree);
+}
+
+/*
+ * Helper function to insert event into the pinned or
+ * flexible groups;
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event_groups *groups;
+
+	groups = get_event_groups(event, ctx);
+	perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the tree
+ * it also detaches all groups on the group's group_list list.
+ */
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+		struct perf_event *event)
+{
+	if (!RB_EMPTY_NODE(&event->group_node) &&
+	    !RB_EMPTY_ROOT(&groups->tree))
+		rb_erase(&event->group_node, &groups->tree);
+
+	init_event_group(event);
+}
+
+/*
+ * Helper function to delete event from its groups;
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event_groups *groups;
+
+	groups = get_event_groups(event, ctx);
+	perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Get a group by a cpu key from groups tree with the least group_index;
+ */
+static struct perf_event *
+perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+{
+	struct perf_event *node_event = NULL, *match = NULL;
+	struct rb_node *node = groups->tree.rb_node;
+
+	while (node) {
+		node_event = container_of(node,
+				struct perf_event, group_node);
+
+		if (cpu < node_event->cpu) {
+			node = node->rb_left;
+		} else if (cpu > node_event->cpu) {
+			node = node->rb_right;
+		} else {
+			match = node_event;
+			node = node->rb_left;
+		}
+	}
+
+	return match;
+}
+
+/*
+ * Find group list by a cpu key and rotate it.
+ */
+static void
+perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
+{
+	struct perf_event *event =
+			perf_event_groups_first(groups, cpu);
+
+	if (event) {
+		perf_event_groups_delete(groups, event);
+		perf_event_groups_insert(groups, event);
+	}
+}
+
+/*
+ * Iterate event groups thru the whole tree.
+ */
+#define perf_event_groups_for_each(event, groups, node)		\
+	for (event = rb_entry_safe(rb_first(&((groups)->tree)),	\
+				typeof(*event), node); event;	\
+		event = rb_entry_safe(rb_next(&event->node),	\
+				typeof(*event), node))
+/*
+ * Iterate event groups with cpu == key.
+ */
+#define perf_event_groups_for_each_cpu(event, key, groups, node) \
+	for (event = perf_event_groups_first(groups, key);	 \
+		event && event->cpu == key;			 \
+		event = rb_entry_safe(rb_next(&event->node),	 \
+				typeof(*event), node))
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1489,12 +1665,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * perf_group_detach can, at all times, locate all siblings.
 	 */
 	if (event->group_leader == event) {
-		struct list_head *list;
-
 		event->group_caps = event->event_caps;
-
-		list = ctx_group_list(event, ctx);
-		list_add_tail(&event->group_entry, list);
+		add_event_to_groups(event, ctx);
 	}
 
 	list_update_cgroup_event(event, ctx, true);
@@ -1688,7 +1860,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	list_del_rcu(&event->event_entry);
 
 	if (event->group_leader == event)
-		list_del_init(&event->group_entry);
+		del_event_from_groups(event, ctx);
 
 	/*
 	 * If event was in error state, then keep it
@@ -1706,7 +1878,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 static void perf_group_detach(struct perf_event *event)
 {
 	struct perf_event *sibling, *tmp;
-	struct list_head *list = NULL;
 
 	lockdep_assert_held(&event->ctx->lock);
 
@@ -1727,22 +1898,23 @@ static void perf_group_detach(struct perf_event *event)
 		goto out;
 	}
 
-	if (!list_empty(&event->group_entry))
-		list = &event->group_entry;
-
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
 	 * to whatever list we are on.
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
-		if (list)
-			list_move_tail(&sibling->group_entry, list);
+
 		sibling->group_leader = sibling;
 
 		/* Inherit group flags from the previous leader */
 		sibling->group_caps = event->group_caps;
 
+		if (!RB_EMPTY_NODE(&event->group_node)) {
+			list_del_init(&sibling->group_entry);
+			add_event_to_groups(sibling, event->ctx);
+		}
+
 		WARN_ON_ONCE(sibling->ctx != event->ctx);
 	}
 
@@ -2186,6 +2358,22 @@ static int group_can_go_on(struct perf_event *event,
 	return can_add_hw;
 }
 
+static int
+flexible_group_sched_in(struct perf_event *event,
+			struct perf_event_context *ctx,
+		        struct perf_cpu_context *cpuctx,
+			int *can_add_hw)
+{
+	if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+		return 0;
+
+	if (group_can_go_on(event, cpuctx, *can_add_hw))
+		if (group_sched_in(event, cpuctx, ctx))
+			*can_add_hw = 0;
+
+	return 1;
+}
+
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
@@ -2652,6 +2840,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
 {
+	int sw = -1, cpu = smp_processor_id();
 	int is_active = ctx->is_active;
 	struct perf_event *event;
 
@@ -2700,12 +2889,20 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+		perf_event_groups_for_each_cpu(event, cpu,
+				&ctx->pinned_groups, group_node)
+			group_sched_out(event, cpuctx, ctx);
+		perf_event_groups_for_each_cpu(event, sw,
+				&ctx->pinned_groups, group_node)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+		perf_event_groups_for_each_cpu(event, cpu,
+				&ctx->flexible_groups, group_node)
+			group_sched_out(event, cpuctx, ctx);
+		perf_event_groups_for_each_cpu(event, sw,
+				&ctx->flexible_groups, group_node)
 			group_sched_out(event, cpuctx, ctx);
 	}
 	perf_pmu_enable(ctx->pmu);
@@ -2996,23 +3193,28 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
+	int sw = -1, cpu = smp_processor_id();
 	struct perf_event *event;
+	int can_add_hw;
+
+	perf_event_groups_for_each_cpu(event, sw,
+			&ctx->pinned_groups, group_node) {
+		can_add_hw = 1;
+		if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
+			if (event->state == PERF_EVENT_STATE_INACTIVE)
+				perf_event_set_state(event,
+						PERF_EVENT_STATE_ERROR);
+		}
+	}
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		if (!event_filter_match(event))
-			continue;
-
-		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx);
-
-		/*
-		 * If this pinned group hasn't been scheduled,
-		 * put it in error state.
-		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE)
-			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+	perf_event_groups_for_each_cpu(event, cpu,
+			&ctx->pinned_groups, group_node) {
+		can_add_hw = 1;
+		if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
+			if (event->state == PERF_EVENT_STATE_INACTIVE)
+				perf_event_set_state(event,
+						PERF_EVENT_STATE_ERROR);
+		}
 	}
 }
 
@@ -3020,25 +3222,19 @@ static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
+	int sw = -1, cpu = smp_processor_id();
 	struct perf_event *event;
 	int can_add_hw = 1;
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		/*
-		 * Listen to the 'cpu' scheduling filter constraint
-		 * of events:
-		 */
-		if (!event_filter_match(event))
-			continue;
+	perf_event_groups_for_each_cpu(event, sw,
+			&ctx->flexible_groups, group_node)
+		flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+
+	can_add_hw = 1;
+	perf_event_groups_for_each_cpu(event, cpu,
+			&ctx->flexible_groups, group_node)
+		flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
 
-		if (group_can_go_on(event, cpuctx, can_add_hw)) {
-			if (group_sched_in(event, cpuctx, ctx))
-				can_add_hw = 0;
-		}
-	}
 }
 
 static void
@@ -3119,7 +3315,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 * However, if task's ctx is not carrying any pinned
 	 * events, no need to flip the cpuctx's events around.
 	 */
-	if (!list_empty(&ctx->pinned_groups))
+	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
@@ -3356,8 +3552,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
 	 */
-	if (!ctx->rotate_disable)
-		list_rotate_left(&ctx->flexible_groups);
+	if (!ctx->rotate_disable) {
+		int sw = -1, cpu = smp_processor_id();
+
+		perf_event_groups_rotate(&ctx->flexible_groups, sw);
+		perf_event_groups_rotate(&ctx->flexible_groups, cpu);
+	}
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3715,8 +3915,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->active_ctx_list);
-	INIT_LIST_HEAD(&ctx->pinned_groups);
-	INIT_LIST_HEAD(&ctx->flexible_groups);
+	perf_event_groups_init(&ctx->pinned_groups);
+	perf_event_groups_init(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 }
@@ -9561,6 +9761,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->group_entry);
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
+	init_event_group(event);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);
@@ -11085,7 +11286,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+	perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)
@@ -11101,7 +11302,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	parent_ctx->rotate_disable = 1;
 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
-	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+	perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)
-- 
cgit v1.2.3-71-gd317


From 161c85fab7875f306eee9655dee71068feeb14ce Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:27 +0100
Subject: perf/core: Cleanup the rb-tree code

Trivial comment and code fixups..

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 84 +++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9fee3640f40..22165b009d73 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1461,9 +1461,9 @@ static enum event_type_t get_event_type(struct perf_event *event)
 }
 
 /*
- * Helper function to initialize group leader event;
+ * Helper function to initialize event group nodes.
  */
-void init_event_group(struct perf_event *event)
+static void init_event_group(struct perf_event *event)
 {
 	RB_CLEAR_NODE(&event->group_node);
 	event->group_index = 0;
@@ -1471,7 +1471,7 @@ void init_event_group(struct perf_event *event)
 
 /*
  * Extract pinned or flexible groups from the context
- * based on event attrs bits;
+ * based on event attrs bits.
  */
 static struct perf_event_groups *
 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
@@ -1483,9 +1483,9 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Helper function to initializes perf event groups object;
+ * Helper function to initializes perf_event_group trees.
  */
-void perf_event_groups_init(struct perf_event_groups *groups)
+static void perf_event_groups_init(struct perf_event_groups *groups)
 {
 	groups->tree = RB_ROOT;
 	groups->index = 0;
@@ -1493,35 +1493,34 @@ void perf_event_groups_init(struct perf_event_groups *groups)
 
 /*
  * Compare function for event groups;
- * Implements complex key that first sorts by CPU and then by
- * virtual index which provides ordering when rotating
- * groups for the same CPU;
+ *
+ * Implements complex key that first sorts by CPU and then by virtual index
+ * which provides ordering when rotating groups for the same CPU.
  */
-int perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+static bool
+perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 {
-	if (left->cpu < right->cpu) {
-		return 1;
-	} else if (left->cpu > right->cpu) {
-		return 0;
-	} else {
-		if (left->group_index < right->group_index) {
-			return 1;
-		} else if(left->group_index > right->group_index) {
-			return 0;
-		} else {
-			return 0;
-		}
-	}
+	if (left->cpu < right->cpu)
+		return true;
+	if (left->cpu > right->cpu)
+		return false;
+
+	if (left->group_index < right->group_index)
+		return true;
+	if (left->group_index > right->group_index)
+		return false;
+
+	return false;
 }
 
 /*
- * Insert a group into a tree using event->cpu as a key. If event->cpu node
- * is already attached to the tree then the event is added to the attached
- * group's group_list list.
+ * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
+ * key (see perf_event_groups_less). This places it last inside the CPU
+ * subtree.
  */
 static void
 perf_event_groups_insert(struct perf_event_groups *groups,
-		struct perf_event *event)
+			 struct perf_event *event)
 {
 	struct perf_event *node_event;
 	struct rb_node *parent;
@@ -1534,8 +1533,7 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 
 	while (*node) {
 		parent = *node;
-		node_event = container_of(*node,
-				struct perf_event, group_node);
+		node_event = container_of(*node, struct perf_event, group_node);
 
 		if (perf_event_groups_less(event, node_event))
 			node = &parent->rb_left;
@@ -1548,8 +1546,7 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 }
 
 /*
- * Helper function to insert event into the pinned or
- * flexible groups;
+ * Helper function to insert event into the pinned or flexible groups.
  */
 static void
 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
@@ -1561,22 +1558,21 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Delete a group from a tree. If the group is directly attached to the tree
- * it also detaches all groups on the group's group_list list.
+ * Delete a group from a tree.
  */
 static void
 perf_event_groups_delete(struct perf_event_groups *groups,
-		struct perf_event *event)
+			 struct perf_event *event)
 {
-	if (!RB_EMPTY_NODE(&event->group_node) &&
-	    !RB_EMPTY_ROOT(&groups->tree))
-		rb_erase(&event->group_node, &groups->tree);
+	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
+		     RB_EMPTY_ROOT(&groups->tree));
 
+	rb_erase(&event->group_node, &groups->tree);
 	init_event_group(event);
 }
 
 /*
- * Helper function to delete event from its groups;
+ * Helper function to delete event from its groups.
  */
 static void
 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
@@ -1588,7 +1584,7 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get a group by a cpu key from groups tree with the least group_index;
+ * Get the leftmost event in the @cpu subtree.
  */
 static struct perf_event *
 perf_event_groups_first(struct perf_event_groups *groups, int cpu)
@@ -1597,8 +1593,7 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 	struct rb_node *node = groups->tree.rb_node;
 
 	while (node) {
-		node_event = container_of(node,
-				struct perf_event, group_node);
+		node_event = container_of(node, struct perf_event, group_node);
 
 		if (cpu < node_event->cpu) {
 			node = node->rb_left;
@@ -1614,13 +1609,14 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 }
 
 /*
- * Find group list by a cpu key and rotate it.
+ * Rotate the @cpu subtree.
+ *
+ * Re-insert the leftmost event at the tail of the subtree.
  */
 static void
 perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
 {
-	struct perf_event *event =
-			perf_event_groups_first(groups, cpu);
+	struct perf_event *event = perf_event_groups_first(groups, cpu);
 
 	if (event) {
 		perf_event_groups_delete(groups, event);
@@ -1629,7 +1625,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
 }
 
 /*
- * Iterate event groups thru the whole tree.
+ * Iterate through the whole groups tree.
  */
 #define perf_event_groups_for_each(event, groups, node)		\
 	for (event = rb_entry_safe(rb_first(&((groups)->tree)),	\
-- 
cgit v1.2.3-71-gd317


From 1cac7b1ae3579457200213303fc28ca13b75592f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:30 +0100
Subject: perf/core: Fix event schedule order

Scheduling in events with cpu=-1 before events with cpu=# changes
semantics and is undesirable in that it would priorize these events.

Given that groups->index is across all groups we actually have an
inter-group ordering, meaning we can merge-sort two groups, which is
just what we need to preserve semantics.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 157 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 108 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 22165b009d73..2d8c0208ca4a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1608,6 +1608,21 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 	return match;
 }
 
+/*
+ * Like rb_entry_next_safe() for the @cpu subtree.
+ */
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event)
+{
+	struct perf_event *next;
+
+	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
+	if (next && next->cpu == event->cpu)
+		return next;
+
+	return NULL;
+}
+
 /*
  * Rotate the @cpu subtree.
  *
@@ -2354,22 +2369,6 @@ static int group_can_go_on(struct perf_event *event,
 	return can_add_hw;
 }
 
-static int
-flexible_group_sched_in(struct perf_event *event,
-			struct perf_event_context *ctx,
-		        struct perf_cpu_context *cpuctx,
-			int *can_add_hw)
-{
-	if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
-		return 0;
-
-	if (group_can_go_on(event, cpuctx, *can_add_hw))
-		if (group_sched_in(event, cpuctx, ctx))
-			*can_add_hw = 0;
-
-	return 1;
-}
-
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
@@ -3185,52 +3184,112 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
+static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
+			      int (*func)(struct perf_event *, void *), void *data)
 {
-	int sw = -1, cpu = smp_processor_id();
-	struct perf_event *event;
-	int can_add_hw;
+	struct perf_event **evt, *evt1, *evt2;
+	int ret;
 
-	perf_event_groups_for_each_cpu(event, sw,
-			&ctx->pinned_groups, group_node) {
-		can_add_hw = 1;
-		if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
-			if (event->state == PERF_EVENT_STATE_INACTIVE)
-				perf_event_set_state(event,
-						PERF_EVENT_STATE_ERROR);
+	evt1 = perf_event_groups_first(groups, -1);
+	evt2 = perf_event_groups_first(groups, cpu);
+
+	while (evt1 || evt2) {
+		if (evt1 && evt2) {
+			if (evt1->group_index < evt2->group_index)
+				evt = &evt1;
+			else
+				evt = &evt2;
+		} else if (evt1) {
+			evt = &evt1;
+		} else {
+			evt = &evt2;
 		}
+
+		ret = func(*evt, data);
+		if (ret)
+			return ret;
+
+		*evt = perf_event_groups_next(*evt);
 	}
 
-	perf_event_groups_for_each_cpu(event, cpu,
-			&ctx->pinned_groups, group_node) {
-		can_add_hw = 1;
-		if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
-			if (event->state == PERF_EVENT_STATE_INACTIVE)
-				perf_event_set_state(event,
-						PERF_EVENT_STATE_ERROR);
-		}
+	return 0;
+}
+
+struct sched_in_data {
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx;
+	int can_add_hw;
+};
+
+static int pinned_sched_in(struct perf_event *event, void *data)
+{
+	struct sched_in_data *sid = data;
+
+	if (event->state <= PERF_EVENT_STATE_OFF)
+		return 0;
+
+	if (!event_filter_match(event))
+		return 0;
+
+	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw))
+		group_sched_in(event, sid->cpuctx, sid->ctx);
+
+	/*
+	 * If this pinned group hasn't been scheduled,
+	 * put it in error state.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+	return 0;
+}
+
+static int flexible_sched_in(struct perf_event *event, void *data)
+{
+	struct sched_in_data *sid = data;
+
+	if (event->state <= PERF_EVENT_STATE_OFF)
+		return 0;
+
+	if (!event_filter_match(event))
+		return 0;
+
+	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+		if (group_sched_in(event, sid->cpuctx, sid->ctx))
+			sid->can_add_hw = 0;
 	}
+
+	return 0;
 }
 
 static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+		    struct perf_cpu_context *cpuctx)
 {
-	int sw = -1, cpu = smp_processor_id();
-	struct perf_event *event;
-	int can_add_hw = 1;
+	struct sched_in_data sid = {
+		.ctx = ctx,
+		.cpuctx = cpuctx,
+		.can_add_hw = 1,
+	};
 
-	perf_event_groups_for_each_cpu(event, sw,
-			&ctx->flexible_groups, group_node)
-		flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+	visit_groups_merge(&ctx->pinned_groups,
+			   smp_processor_id(),
+			   pinned_sched_in, &sid);
+}
 
-	can_add_hw = 1;
-	perf_event_groups_for_each_cpu(event, cpu,
-			&ctx->flexible_groups, group_node)
-		flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx)
+{
+	struct sched_in_data sid = {
+		.ctx = ctx,
+		.cpuctx = cpuctx,
+		.can_add_hw = 1,
+	};
 
+	visit_groups_merge(&ctx->flexible_groups,
+			   smp_processor_id(),
+			   flexible_sched_in, &sid);
 }
 
 static void
-- 
cgit v1.2.3-71-gd317


From 8343aae66167df6708128a778e750d48dbe31302 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:33 +0100
Subject: perf/core: Remove perf_event::group_entry

Now that all the grouping is done with RB trees, we no longer need
group_entry and can replace the whole thing with sibling_list.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/kernel/perf_event.c           |  2 +-
 arch/arm/mach-imx/mmdc.c                 |  2 +-
 arch/arm/mm/cache-l2x0-pmu.c             |  2 +-
 arch/mips/kernel/perf_event_mipsxx.c     |  2 +-
 arch/powerpc/perf/core-book3s.c          |  2 +-
 arch/powerpc/perf/core-fsl-emb.c         |  2 +-
 arch/sparc/kernel/perf_event.c           |  2 +-
 arch/x86/events/core.c                   |  2 +-
 arch/x86/events/intel/uncore.c           |  2 +-
 drivers/bus/arm-cci.c                    |  2 +-
 drivers/bus/arm-ccn.c                    |  2 +-
 drivers/perf/arm_dsu_pmu.c               |  2 +-
 drivers/perf/arm_pmu.c                   |  2 +-
 drivers/perf/hisilicon/hisi_uncore_pmu.c |  3 +--
 drivers/perf/qcom_l2_pmu.c               |  4 ++--
 drivers/perf/qcom_l3_pmu.c               |  2 +-
 drivers/perf/xgene_pmu.c                 |  2 +-
 include/linux/perf_event.h               |  5 -----
 kernel/events/core.c                     | 37 ++++++++++++++++----------------
 19 files changed, 36 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index a1f6bc7f1e4c..435864c24479 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -351,7 +351,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		evtype[n] = group->hw.event_base;
 		current_idx[n++] = PMC_NO_INDEX;
 	}
-	list_for_each_entry(pe, &group->sibling_list, group_entry) {
+	list_for_each_entry(pe, &group->sibling_list, sibling_list) {
 		if (!is_software_event(pe) && pe->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
 				return -1;
diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c
index 5fb1d2254b5e..27a9ca20933e 100644
--- a/arch/arm/mach-imx/mmdc.c
+++ b/arch/arm/mach-imx/mmdc.c
@@ -269,7 +269,7 @@ static bool mmdc_pmu_group_is_valid(struct perf_event *event)
 			return false;
 	}
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (!mmdc_pmu_group_event_is_valid(sibling, pmu, &counter_mask))
 			return false;
 	}
diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c
index 0a1e2280141f..3a89ea4c2b57 100644
--- a/arch/arm/mm/cache-l2x0-pmu.c
+++ b/arch/arm/mm/cache-l2x0-pmu.c
@@ -293,7 +293,7 @@ static bool l2x0_pmu_group_is_valid(struct perf_event *event)
 	else if (!is_software_event(leader))
 		return false;
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (sibling->pmu == pmu)
 			num_hw++;
 		else if (!is_software_event(sibling))
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 6668f67a61c3..46097ff3208b 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -711,7 +711,7 @@ static int validate_group(struct perf_event *event)
 	if (mipsxx_pmu_alloc_counter(&fake_cpuc, &leader->hw) < 0)
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (mipsxx_pmu_alloc_counter(&fake_cpuc, &sibling->hw) < 0)
 			return -EINVAL;
 	}
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index f89bbd54ecec..7c1f66050433 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1426,7 +1426,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		flags[n] = group->hw.event_base;
 		events[n++] = group->hw.config;
 	}
-	list_for_each_entry(event, &group->sibling_list, group_entry) {
+	list_for_each_entry(event, &group->sibling_list, sibling_list) {
 		if (event->pmu->task_ctx_nr == perf_hw_context &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index 5d747b4cb8ee..94c2e63662c6 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -277,7 +277,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		ctrs[n] = group;
 		n++;
 	}
-	list_for_each_entry(event, &group->sibling_list, group_entry) {
+	list_for_each_entry(event, &group->sibling_list, sibling_list) {
 		if (!is_software_event(event) &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 5c1f54758312..a0a86d369119 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1342,7 +1342,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		events[n] = group->hw.event_base;
 		current_idx[n++] = PIC_NO_INDEX;
 	}
-	list_for_each_entry(event, &group->sibling_list, group_entry) {
+	list_for_each_entry(event, &group->sibling_list, sibling_list) {
 		if (!is_software_event(event) &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9c86e10f1196..77a4125b6b1f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -990,7 +990,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 	if (!dogrp)
 		return n;
 
-	list_for_each_entry(event, &leader->sibling_list, group_entry) {
+	list_for_each_entry(event, &leader->sibling_list, sibling_list) {
 		if (!is_x86_event(event) ||
 		    event->state <= PERF_EVENT_STATE_OFF)
 			continue;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 7874c980d569..9e374cd22ad2 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -354,7 +354,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
 	if (!dogrp)
 		return n;
 
-	list_for_each_entry(event, &leader->sibling_list, group_entry) {
+	list_for_each_entry(event, &leader->sibling_list, sibling_list) {
 		if (!is_box_event(box, event) ||
 		    event->state <= PERF_EVENT_STATE_OFF)
 			continue;
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index 5426c04fe24b..c98435bdb64f 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -1311,7 +1311,7 @@ validate_group(struct perf_event *event)
 	if (!validate_event(event->pmu, &fake_pmu, leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (!validate_event(event->pmu, &fake_pmu, sibling))
 			return -EINVAL;
 	}
diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c
index b52332e52ca5..1c310a4be000 100644
--- a/drivers/bus/arm-ccn.c
+++ b/drivers/bus/arm-ccn.c
@@ -847,7 +847,7 @@ static int arm_ccn_pmu_event_init(struct perf_event *event)
 		return -EINVAL;
 
 	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			group_entry)
+			sibling_list)
 		if (sibling->pmu != event->pmu &&
 				!is_software_event(sibling))
 			return -EINVAL;
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index 38f2cc2a6c74..660680d78147 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -536,7 +536,7 @@ static bool dsu_pmu_validate_group(struct perf_event *event)
 	memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask));
 	if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader))
 		return false;
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling))
 			return false;
 	}
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 0c2ed11c0603..628d7a7b9526 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -311,7 +311,7 @@ validate_group(struct perf_event *event)
 	if (!validate_event(event->pmu, &fake_pmu, leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (!validate_event(event->pmu, &fake_pmu, sibling))
 			return -EINVAL;
 	}
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index 7ed24b954422..e3356087fd76 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -82,8 +82,7 @@ static bool hisi_validate_event_group(struct perf_event *event)
 			counters++;
 	}
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			    group_entry) {
+	list_for_each_entry(sibling, &event->group_leader->sibling_list, sibling_list) {
 		if (is_software_event(sibling))
 			continue;
 		if (sibling->pmu != event->pmu)
diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
index 4fdc8486a8e4..5e535a718965 100644
--- a/drivers/perf/qcom_l2_pmu.c
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -535,7 +535,7 @@ static int l2_cache_event_init(struct perf_event *event)
 	}
 
 	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			    group_entry)
+			    sibling_list)
 		if (sibling->pmu != event->pmu &&
 		    !is_software_event(sibling)) {
 			dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
@@ -572,7 +572,7 @@ static int l2_cache_event_init(struct perf_event *event)
 	}
 
 	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			    group_entry) {
+			    sibling_list) {
 		if ((sibling != event) &&
 		    !is_software_event(sibling) &&
 		    (L2_EVT_GROUP(sibling->attr.config) ==
diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c
index 7f6b62b29e9d..5dedf4b1a552 100644
--- a/drivers/perf/qcom_l3_pmu.c
+++ b/drivers/perf/qcom_l3_pmu.c
@@ -468,7 +468,7 @@ static bool qcom_l3_cache__validate_event_group(struct perf_event *event)
 	counters = event_num_counters(event);
 	counters += event_num_counters(leader);
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
 		if (is_software_event(sibling))
 			continue;
 		if (sibling->pmu != event->pmu)
diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c
index eb23311bc70c..f1f4a56cab5e 100644
--- a/drivers/perf/xgene_pmu.c
+++ b/drivers/perf/xgene_pmu.c
@@ -950,7 +950,7 @@ static int xgene_perf_event_init(struct perf_event *event)
 		return -EINVAL;
 
 	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			group_entry)
+			sibling_list)
 		if (sibling->pmu != event->pmu &&
 				!is_software_event(sibling))
 			return -EINVAL;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6e3f854a34d8..84044ec21b31 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -549,14 +549,9 @@ struct perf_event {
 	struct list_head		event_entry;
 
 	/*
-	 * XXX: group_entry and sibling_list should be mutually exclusive;
-	 * either you're a sibling on a group, or you're the group leader.
-	 * Rework the code to always use the same list element.
-	 *
 	 * Locked for modification by both ctx->mutex and ctx->lock; holding
 	 * either sufficies for read.
 	 */
-	struct list_head		group_entry;
 	struct list_head		sibling_list;
 	/*
 	 * Node on the pinned or flexible tree located at the event context;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2d8c0208ca4a..9a07bbe66451 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader)
 {
 	struct perf_event *sibling;
 
-	list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+	list_for_each_entry(sibling, &leader->sibling_list, sibling_list)
 		perf_event_update_time(sibling);
 }
 
@@ -1835,12 +1835,12 @@ static void perf_group_attach(struct perf_event *event)
 
 	group_leader->group_caps &= event->event_caps;
 
-	list_add_tail(&event->group_entry, &group_leader->sibling_list);
+	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
 
 	perf_event__header_size(group_leader);
 
-	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+	list_for_each_entry(pos, &group_leader->sibling_list, sibling_list)
 		perf_event__header_size(pos);
 }
 
@@ -1904,7 +1904,7 @@ static void perf_group_detach(struct perf_event *event)
 	 * If this is a sibling, remove it from its group.
 	 */
 	if (event->group_leader != event) {
-		list_del_init(&event->group_entry);
+		list_del_init(&event->sibling_list);
 		event->group_leader->nr_siblings--;
 		goto out;
 	}
@@ -1914,7 +1914,7 @@ static void perf_group_detach(struct perf_event *event)
 	 * upgrade the siblings to singleton events by adding them
 	 * to whatever list we are on.
 	 */
-	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 
 		sibling->group_leader = sibling;
 
@@ -1922,7 +1922,7 @@ static void perf_group_detach(struct perf_event *event)
 		sibling->group_caps = event->group_caps;
 
 		if (!RB_EMPTY_NODE(&event->group_node)) {
-			list_del_init(&sibling->group_entry);
+			list_del_init(&sibling->sibling_list);
 			add_event_to_groups(sibling, event->ctx);
 		}
 
@@ -1932,7 +1932,7 @@ static void perf_group_detach(struct perf_event *event)
 out:
 	perf_event__header_size(event->group_leader);
 
-	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+	list_for_each_entry(tmp, &event->group_leader->sibling_list, sibling_list)
 		perf_event__header_size(tmp);
 }
 
@@ -1960,7 +1960,7 @@ static inline int pmu_filter_match(struct perf_event *event)
 	if (!__pmu_filter_match(event))
 		return 0;
 
-	list_for_each_entry(child, &event->sibling_list, group_entry) {
+	list_for_each_entry(child, &event->sibling_list, sibling_list) {
 		if (!__pmu_filter_match(child))
 			return 0;
 	}
@@ -2028,7 +2028,7 @@ group_sched_out(struct perf_event *group_event,
 	/*
 	 * Schedule out siblings (if any):
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, group_entry)
+	list_for_each_entry(event, &group_event->sibling_list, sibling_list)
 		event_sched_out(event, cpuctx, ctx);
 
 	perf_pmu_enable(ctx->pmu);
@@ -2307,7 +2307,7 @@ group_sched_in(struct perf_event *group_event,
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+	list_for_each_entry(event, &group_event->sibling_list, sibling_list) {
 		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
@@ -2323,7 +2323,7 @@ group_error:
 	 * partial group before returning:
 	 * The events up to the failed event are scheduled out normally.
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+	list_for_each_entry(event, &group_event->sibling_list, sibling_list) {
 		if (event == partial_group)
 			break;
 
@@ -3796,7 +3796,7 @@ static void __perf_event_read(void *info)
 
 	pmu->read(event);
 
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+	list_for_each_entry(sub, &event->sibling_list, sibling_list) {
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 			/*
 			 * Use sibling's PMU rather than @event's since
@@ -4642,7 +4642,7 @@ static int __perf_read_group_add(struct perf_event *leader,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sub, &leader->sibling_list, sibling_list) {
 		values[n++] += perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
@@ -4836,7 +4836,7 @@ static void perf_event_for_each(struct perf_event *event,
 	event = event->group_leader;
 
 	perf_event_for_each_child(event, func);
-	list_for_each_entry(sibling, &event->sibling_list, group_entry)
+	list_for_each_entry(sibling, &event->sibling_list, sibling_list)
 		perf_event_for_each_child(sibling, func);
 }
 
@@ -5995,7 +5995,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 	__output_copy(handle, values, n * sizeof(u64));
 
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+	list_for_each_entry(sub, &leader->sibling_list, sibling_list) {
 		n = 0;
 
 		if ((sub != event) &&
@@ -9813,7 +9813,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	mutex_init(&event->child_mutex);
 	INIT_LIST_HEAD(&event->child_list);
 
-	INIT_LIST_HEAD(&event->group_entry);
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_event_group(event);
@@ -10581,7 +10580,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_ctx(gctx);
 
 		list_for_each_entry(sibling, &group_leader->sibling_list,
-				    group_entry) {
+				    sibling_list) {
 			perf_remove_from_context(sibling, 0);
 			put_ctx(gctx);
 		}
@@ -10603,7 +10602,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * reachable through the group lists.
 		 */
 		list_for_each_entry(sibling, &group_leader->sibling_list,
-				    group_entry) {
+				    sibling_list) {
 			perf_event__state_init(sibling);
 			perf_install_in_context(ctx, sibling, sibling->cpu);
 			get_ctx(ctx);
@@ -11242,7 +11241,7 @@ static int inherit_group(struct perf_event *parent_event,
 	 * case inherit_event() will create individual events, similar to what
 	 * perf_group_detach() would do anyway.
 	 */
-	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+	list_for_each_entry(sub, &parent_event->sibling_list, sibling_list) {
 		child_ctr = inherit_event(sub, parent, parent_ctx,
 					    child, leader, child_ctx);
 		if (IS_ERR(child_ctr))
-- 
cgit v1.2.3-71-gd317


From 6668128a9e25f7a11d25359e46df2541e6b43fc9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:38 +0100
Subject: perf/core: Optimize ctx_sched_out()

When an event group contains more events than can be scheduled on the
hardware, iterating the full event group for ctx_sched_out is a waste
of time.

Keep track of the events that got programmed on the hardware, such
that we can iterate this smaller list in order to schedule them out.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  5 +++++
 kernel/events/core.c       | 53 +++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 84044ec21b31..2bb200e1bbea 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -553,6 +553,7 @@ struct perf_event {
 	 * either sufficies for read.
 	 */
 	struct list_head		sibling_list;
+	struct list_head		active_list;
 	/*
 	 * Node on the pinned or flexible tree located at the event context;
 	 */
@@ -718,6 +719,10 @@ struct perf_event_context {
 	struct perf_event_groups	pinned_groups;
 	struct perf_event_groups	flexible_groups;
 	struct list_head		event_list;
+
+	struct list_head		pinned_active;
+	struct list_head		flexible_active;
+
 	int				nr_events;
 	int				nr_active;
 	int				is_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9a07bbe66451..4d601c06074f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1647,14 +1647,6 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
 				typeof(*event), node); event;	\
 		event = rb_entry_safe(rb_next(&event->node),	\
 				typeof(*event), node))
-/*
- * Iterate event groups with cpu == key.
- */
-#define perf_event_groups_for_each_cpu(event, key, groups, node) \
-	for (event = perf_event_groups_first(groups, key);	 \
-		event && event->cpu == key;			 \
-		event = rb_entry_safe(rb_next(&event->node),	 \
-				typeof(*event), node))
 
 /*
  * Add a event from the lists for its context.
@@ -1889,8 +1881,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 static void perf_group_detach(struct perf_event *event)
 {
 	struct perf_event *sibling, *tmp;
+	struct perf_event_context *ctx = event->ctx;
 
-	lockdep_assert_held(&event->ctx->lock);
+	lockdep_assert_held(&ctx->lock);
 
 	/*
 	 * We can have double detach due to exit/hot-unplug + close.
@@ -1924,6 +1917,13 @@ static void perf_group_detach(struct perf_event *event)
 		if (!RB_EMPTY_NODE(&event->group_node)) {
 			list_del_init(&sibling->sibling_list);
 			add_event_to_groups(sibling, event->ctx);
+
+			if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
+				struct list_head *list = sibling->attr.pinned ?
+					&ctx->pinned_active : &ctx->flexible_active;
+
+				list_add_tail(&sibling->active_list, list);
+			}
 		}
 
 		WARN_ON_ONCE(sibling->ctx != event->ctx);
@@ -1988,6 +1988,13 @@ event_sched_out(struct perf_event *event,
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
+	/*
+	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
+	 * we can schedule events _OUT_ individually through things like
+	 * __perf_remove_from_context().
+	 */
+	list_del_init(&event->active_list);
+
 	perf_pmu_disable(event->pmu);
 
 	event->pmu->del(event, 0);
@@ -2835,9 +2842,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
 {
-	int sw = -1, cpu = smp_processor_id();
+	struct perf_event *event, *tmp;
 	int is_active = ctx->is_active;
-	struct perf_event *event;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -2884,20 +2890,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
-		perf_event_groups_for_each_cpu(event, cpu,
-				&ctx->pinned_groups, group_node)
-			group_sched_out(event, cpuctx, ctx);
-		perf_event_groups_for_each_cpu(event, sw,
-				&ctx->pinned_groups, group_node)
+		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
-		perf_event_groups_for_each_cpu(event, cpu,
-				&ctx->flexible_groups, group_node)
-			group_sched_out(event, cpuctx, ctx);
-		perf_event_groups_for_each_cpu(event, sw,
-				&ctx->flexible_groups, group_node)
+		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
 			group_sched_out(event, cpuctx, ctx);
 	}
 	perf_pmu_enable(ctx->pmu);
@@ -3231,8 +3229,10 @@ static int pinned_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw))
-		group_sched_in(event, sid->cpuctx, sid->ctx);
+	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+			list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+	}
 
 	/*
 	 * If this pinned group hasn't been scheduled,
@@ -3255,7 +3255,9 @@ static int flexible_sched_in(struct perf_event *event, void *data)
 		return 0;
 
 	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		if (group_sched_in(event, sid->cpuctx, sid->ctx))
+		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+			list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+		else
 			sid->can_add_hw = 0;
 	}
 
@@ -3973,6 +3975,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	perf_event_groups_init(&ctx->pinned_groups);
 	perf_event_groups_init(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
+	INIT_LIST_HEAD(&ctx->pinned_active);
+	INIT_LIST_HEAD(&ctx->flexible_active);
 	atomic_set(&ctx->refcount, 1);
 }
 
@@ -9815,6 +9819,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
+	INIT_LIST_HEAD(&event->active_list);
 	init_event_group(event);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
-- 
cgit v1.2.3-71-gd317


From 6e6804d2fa0eff6520f3a2b48ff52bcb9dc25a9d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:41 +0100
Subject: perf/core: Simpify perf_event_groups_for_each()

The last argument is, and always must be, the same.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d601c06074f..fc5dd072c194 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1642,11 +1642,11 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
 /*
  * Iterate through the whole groups tree.
  */
-#define perf_event_groups_for_each(event, groups, node)		\
-	for (event = rb_entry_safe(rb_first(&((groups)->tree)),	\
-				typeof(*event), node); event;	\
-		event = rb_entry_safe(rb_next(&event->node),	\
-				typeof(*event), node))
+#define perf_event_groups_for_each(event, groups)			\
+	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
+				typeof(*event), group_node); event;	\
+		event = rb_entry_safe(rb_next(&event->group_node),	\
+				typeof(*event), group_node))
 
 /*
  * Add a event from the lists for its context.
@@ -11345,7 +11345,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) {
+	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)
@@ -11361,7 +11361,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	parent_ctx->rotate_disable = 1;
 	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
-	perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) {
+	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
 		if (ret)
-- 
cgit v1.2.3-71-gd317


From 8703a7cfe148f73062c568e9a8549ce692104864 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 13 Nov 2017 14:28:44 +0100
Subject: perf/core: Fix tree based event rotation

Similar to how first programming cpu=-1 and then cpu=# is wrong, so is
rotating both. It was especially wrong when we were still programming
the PMU in this same order, because in that scenario we might never
actually end up running cpu=# events at all.

Cure this by using the active_list to pick the rotation event; since
at programming we already select the left-most event.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Dmitri Prokhorov <Dmitry.Prohorov@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valery Cherepennikov <valery.cherepennikov@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 44 ++++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc5dd072c194..460e485220e8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1623,22 +1623,6 @@ perf_event_groups_next(struct perf_event *event)
 	return NULL;
 }
 
-/*
- * Rotate the @cpu subtree.
- *
- * Re-insert the leftmost event at the tail of the subtree.
- */
-static void
-perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
-{
-	struct perf_event *event = perf_event_groups_first(groups, cpu);
-
-	if (event) {
-		perf_event_groups_delete(groups, event);
-		perf_event_groups_insert(groups, event);
-	}
-}
-
 /*
  * Iterate through the whole groups tree.
  */
@@ -3601,24 +3585,24 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 }
 
 /*
- * Round-robin a context's events:
+ * Move @event to the tail of the @ctx's elegible events.
  */
-static void rotate_ctx(struct perf_event_context *ctx)
+static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
 {
 	/*
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
 	 */
-	if (!ctx->rotate_disable) {
-		int sw = -1, cpu = smp_processor_id();
+	if (ctx->rotate_disable)
+		return;
 
-		perf_event_groups_rotate(&ctx->flexible_groups, sw);
-		perf_event_groups_rotate(&ctx->flexible_groups, cpu);
-	}
+	perf_event_groups_delete(&ctx->flexible_groups, event);
+	perf_event_groups_insert(&ctx->flexible_groups, event);
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
+	struct perf_event *ctx_event = NULL, *cpuctx_event = NULL;
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0;
 
@@ -3639,13 +3623,21 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_pmu_disable(cpuctx->ctx.pmu);
 
+	cpuctx_event = list_first_entry_or_null(&cpuctx->ctx.flexible_active,
+						struct perf_event, active_list);
+	if (ctx) {
+		ctx_event = list_first_entry_or_null(&ctx->flexible_active,
+						     struct perf_event, active_list);
+	}
+
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 
-	rotate_ctx(&cpuctx->ctx);
-	if (ctx)
-		rotate_ctx(ctx);
+	if (cpuctx_event)
+		rotate_ctx(&cpuctx->ctx, cpuctx_event);
+	if (ctx_event)
+		rotate_ctx(ctx, ctx_event);
 
 	perf_event_sched_in(cpuctx, ctx, current);
 
-- 
cgit v1.2.3-71-gd317


From 8d5bce0c37fa10f21dbdd6a6d8fcba85202fe24e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Mar 2018 14:56:27 +0100
Subject: perf/core: Optimize perf_rotate_context() event scheduling

The event schedule order (as per perf_event_sched_in()) is:

 - cpu  pinned
 - task pinned
 - cpu  flexible
 - task flexible

But perf_rotate_context() will unschedule cpu-flexible even if it
doesn't need a rotation.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 60 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 460e485220e8..f98c0f88cc94 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void)
 	WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
-static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
@@ -1041,7 +1041,7 @@ list_update_cgroup_event(struct perf_event *event,
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	int rotations = 0;
+	bool rotations;
 
 	lockdep_assert_irqs_disabled();
 
@@ -3600,52 +3600,66 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
 	perf_event_groups_insert(&ctx->flexible_groups, event);
 }
 
-static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+static inline struct perf_event *
+ctx_first_active(struct perf_event_context *ctx)
 {
-	struct perf_event *ctx_event = NULL, *cpuctx_event = NULL;
+	return list_first_entry_or_null(&ctx->flexible_active,
+					struct perf_event, active_list);
+}
+
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *cpu_event = NULL, *task_event = NULL;
+	bool cpu_rotate = false, task_rotate = false;
 	struct perf_event_context *ctx = NULL;
-	int rotate = 0;
+
+	/*
+	 * Since we run this from IRQ context, nobody can install new
+	 * events, thus the event count values are stable.
+	 */
 
 	if (cpuctx->ctx.nr_events) {
 		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-			rotate = 1;
+			cpu_rotate = true;
 	}
 
 	ctx = cpuctx->task_ctx;
 	if (ctx && ctx->nr_events) {
 		if (ctx->nr_events != ctx->nr_active)
-			rotate = 1;
+			task_rotate = true;
 	}
 
-	if (!rotate)
-		goto done;
+	if (!(cpu_rotate || task_rotate))
+		return false;
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_pmu_disable(cpuctx->ctx.pmu);
 
-	cpuctx_event = list_first_entry_or_null(&cpuctx->ctx.flexible_active,
-						struct perf_event, active_list);
-	if (ctx) {
-		ctx_event = list_first_entry_or_null(&ctx->flexible_active,
-						     struct perf_event, active_list);
-	}
+	if (task_rotate)
+		task_event = ctx_first_active(ctx);
+	if (cpu_rotate)
+		cpu_event = ctx_first_active(&cpuctx->ctx);
 
-	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	if (ctx)
+	/*
+	 * As per the order given at ctx_resched() first 'pop' task flexible
+	 * and then, if needed CPU flexible.
+	 */
+	if (task_event || (ctx && cpu_event))
 		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+	if (cpu_event)
+		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	if (cpuctx_event)
-		rotate_ctx(&cpuctx->ctx, cpuctx_event);
-	if (ctx_event)
-		rotate_ctx(ctx, ctx_event);
+	if (task_event)
+		rotate_ctx(ctx, task_event);
+	if (cpu_event)
+		rotate_ctx(&cpuctx->ctx, cpu_event);
 
 	perf_event_sched_in(cpuctx, ctx, current);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-done:
 
-	return rotate;
+	return true;
 }
 
 void perf_event_task_tick(void)
-- 
cgit v1.2.3-71-gd317


From 33801b94741d6c3be9713c10aa627477216c21e2 Mon Sep 17 00:00:00 2001
From: "leilei.lin" <leilei.lin@alibaba-inc.com>
Date: Tue, 6 Mar 2018 17:36:37 +0800
Subject: perf/core: Fix installing cgroup events on CPU

There's two problems when installing cgroup events on CPUs: firstly
list_update_cgroup_event() only tries to set cpuctx->cgrp for the
first event, if that mismatches on @cgrp we'll not try again for later
additions.

Secondly, when we install a cgroup event into an active context, only
issue an event reprogram when the event matches the current cgroup
context. This avoids a pointless event reprogramming.

Signed-off-by: leilei.lin <leilei.lin@alibaba-inc.com>
[ Improved the changelog and comments. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: brendan.d.gregg@gmail.com
Cc: eranian@gmail.com
Cc: linux-kernel@vger.kernel.org
Cc: yang_oliver@hotmail.com
Link: http://lkml.kernel.org/r/20180306093637.28247-1-linxiulei@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f98c0f88cc94..969f865f9f1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -937,27 +937,39 @@ list_update_cgroup_event(struct perf_event *event,
 	if (!is_cgroup_event(event))
 		return;
 
-	if (add && ctx->nr_cgroups++)
-		return;
-	else if (!add && --ctx->nr_cgroups)
-		return;
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
-	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
-	if (add) {
+
+	/*
+	 * Since setting cpuctx->cgrp is conditional on the current @cgrp
+	 * matching the event's cgroup, we must do this for every new event,
+	 * because if the first would mismatch, the second would not try again
+	 * and we would leave cpuctx->cgrp unset.
+	 */
+	if (add && !cpuctx->cgrp) {
 		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 
-		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 			cpuctx->cgrp = cgrp;
-	} else {
-		list_del(cpuctx_entry);
-		cpuctx->cgrp = NULL;
 	}
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+
+	/* no cgroup running */
+	if (!add)
+		cpuctx->cgrp = NULL;
+
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	if (add)
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+	else
+		list_del(cpuctx_entry);
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -2489,6 +2501,18 @@ static int  __perf_install_in_context(void *info)
 		raw_spin_lock(&task_ctx->lock);
 	}
 
+#ifdef CONFIG_CGROUP_PERF
+	if (is_cgroup_event(event)) {
+		/*
+		 * If the current cgroup doesn't match the event's
+		 * cgroup, we should not try to schedule it.
+		 */
+		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
+					event->cgrp->css.cgroup);
+	}
+#endif
+
 	if (reprogram) {
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
-- 
cgit v1.2.3-71-gd317


From b6b76dd62c56f257cdd30900ed22668c42a74030 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 12 Mar 2018 19:00:49 +0900
Subject: error-injection: Fix to prohibit jump optimization

Since the kprobe which was optimized by jump can not change
the execution path, the kprobe for error-injection must not
be optimized. To prohibit it, set a dummy post-handler as
officially stated in Documentation/kprobes.txt.

Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/fail_function.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 21b0122cb39c..1d5632d8bbcc 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -14,6 +14,15 @@
 
 static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
 
+static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs,
+			     unsigned long flags)
+{
+	/*
+	 * A dummy post handler is required to prohibit optimizing, because
+	 * jump optimization does not support execution path overriding.
+	 */
+}
+
 struct fei_attr {
 	struct list_head list;
 	struct kprobe kp;
@@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
 			return NULL;
 		}
 		attr->kp.pre_handler = fei_kprobe_handler;
+		attr->kp.post_handler = fei_post_handler;
 		attr->retval = adjust_error_retval(addr, 0);
 		INIT_LIST_HEAD(&attr->list);
 	}
-- 
cgit v1.2.3-71-gd317


From cbd9d9f114f9e5931ba199f4450667656b892f32 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:41 +0100
Subject: hw_breakpoint: Pass bp_type directly as find_slot_idx() argument

Pass bp_type directly as a find_slot_idx() argument,
so we don't need to have whole event to get the
breakpoint slot type. It will be used in following
changes.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-2-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3f8cb1e14588..395ca07965af 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -85,9 +85,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp)
 	return 1;
 }
 
-static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+static inline enum bp_type_idx find_slot_idx(u64 bp_type)
 {
-	if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+	if (bp_type & HW_BREAKPOINT_RW)
 		return TYPE_DATA;
 
 	return TYPE_INST;
@@ -122,7 +122,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
 		if (iter->hw.target == tsk &&
-		    find_slot_idx(iter) == type &&
+		    find_slot_idx(iter->attr.bp_type) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
 	}
@@ -292,7 +292,7 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
 		return -EINVAL;
 
-	type = find_slot_idx(bp);
+	type = find_slot_idx(bp->attr.bp_type);
 	weight = hw_breakpoint_weight(bp);
 
 	fetch_bp_busy_slots(&slots, bp, type);
@@ -329,7 +329,7 @@ static void __release_bp_slot(struct perf_event *bp)
 	enum bp_type_idx type;
 	int weight;
 
-	type = find_slot_idx(bp);
+	type = find_slot_idx(bp->attr.bp_type);
 	weight = hw_breakpoint_weight(bp);
 	toggle_bp_slot(bp, false, type, weight);
 }
-- 
cgit v1.2.3-71-gd317


From 1ad9ff7dea4c42bee43f98b7d7ce8037ee22e133 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:42 +0100
Subject: hw_breakpoint: Pass bp_type argument to
 __reserve_bp_slot|__release_bp_slot()

Passing bp_type argument to __reserve_bp_slot() and __release_bp_slot()
functions, so we can pass another bp_type than the one defined in
bp->attr.bp_type. This will be handy in following change that fixes
breakpoint slot counts during its modification.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-3-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 395ca07965af..9b31fbcc3305 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -277,7 +277,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *       ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
  *            + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
  */
-static int __reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
 {
 	struct bp_busy_slots slots = {0};
 	enum bp_type_idx type;
@@ -288,11 +288,11 @@ static int __reserve_bp_slot(struct perf_event *bp)
 		return -ENOMEM;
 
 	/* Basic checks */
-	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
-	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+	if (bp_type == HW_BREAKPOINT_EMPTY ||
+	    bp_type == HW_BREAKPOINT_INVALID)
 		return -EINVAL;
 
-	type = find_slot_idx(bp->attr.bp_type);
+	type = find_slot_idx(bp_type);
 	weight = hw_breakpoint_weight(bp);
 
 	fetch_bp_busy_slots(&slots, bp, type);
@@ -317,19 +317,19 @@ int reserve_bp_slot(struct perf_event *bp)
 
 	mutex_lock(&nr_bp_mutex);
 
-	ret = __reserve_bp_slot(bp);
+	ret = __reserve_bp_slot(bp, bp->attr.bp_type);
 
 	mutex_unlock(&nr_bp_mutex);
 
 	return ret;
 }
 
-static void __release_bp_slot(struct perf_event *bp)
+static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
 {
 	enum bp_type_idx type;
 	int weight;
 
-	type = find_slot_idx(bp->attr.bp_type);
+	type = find_slot_idx(bp_type);
 	weight = hw_breakpoint_weight(bp);
 	toggle_bp_slot(bp, false, type, weight);
 }
@@ -339,7 +339,7 @@ void release_bp_slot(struct perf_event *bp)
 	mutex_lock(&nr_bp_mutex);
 
 	arch_unregister_hw_breakpoint(bp);
-	__release_bp_slot(bp);
+	__release_bp_slot(bp, bp->attr.bp_type);
 
 	mutex_unlock(&nr_bp_mutex);
 }
@@ -354,7 +354,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp)
 	if (mutex_is_locked(&nr_bp_mutex))
 		return -1;
 
-	return __reserve_bp_slot(bp);
+	return __reserve_bp_slot(bp, bp->attr.bp_type);
 }
 
 int dbg_release_bp_slot(struct perf_event *bp)
@@ -362,7 +362,7 @@ int dbg_release_bp_slot(struct perf_event *bp)
 	if (mutex_is_locked(&nr_bp_mutex))
 		return -1;
 
-	__release_bp_slot(bp);
+	__release_bp_slot(bp, bp->attr.bp_type);
 
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From ea6a9d530c1779b9bd30944b803820c462f01eac Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:43 +0100
Subject: hw_breakpoint: Add modify_bp_slot() function

Add the modify_bp_slot() function to keep slot numbers
correct when changing the breakpoint type.

Using existing __release_bp_slot()/__reserve_bp_slot()
call sequence to update the slot counts.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-4-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 46 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9b31fbcc3305..776948beb4ac 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -44,6 +44,7 @@
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/bug.h>
 
 #include <linux/hw_breakpoint.h>
 /*
@@ -344,6 +345,38 @@ void release_bp_slot(struct perf_event *bp)
 	mutex_unlock(&nr_bp_mutex);
 }
 
+static int __modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+	int err;
+
+	__release_bp_slot(bp, old_type);
+
+	err = __reserve_bp_slot(bp, bp->attr.bp_type);
+	if (err) {
+		/*
+		 * Reserve the old_type slot back in case
+		 * there's no space for the new type.
+		 *
+		 * This must succeed, because we just released
+		 * the old_type slot in the __release_bp_slot
+		 * call above. If not, something is broken.
+		 */
+		WARN_ON(__reserve_bp_slot(bp, old_type));
+	}
+
+	return err;
+}
+
+static int modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+	int ret;
+
+	mutex_lock(&nr_bp_mutex);
+	ret = __modify_bp_slot(bp, old_type);
+	mutex_unlock(&nr_bp_mutex);
+	return ret;
+}
+
 /*
  * Allow the kernel debugger to reserve breakpoint slots without
  * taking a lock using the dbg_* variant of for the reserve and
@@ -435,6 +468,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	u64 old_addr = bp->attr.bp_addr;
 	u64 old_len = bp->attr.bp_len;
 	int old_type = bp->attr.bp_type;
+	bool modify = attr->bp_type != old_type;
 	int err = 0;
 
 	/*
@@ -452,12 +486,9 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	bp->attr.bp_type = attr->bp_type;
 	bp->attr.bp_len = attr->bp_len;
 
-	if (attr->disabled)
-		goto end;
-
 	err = validate_hw_breakpoint(bp);
-	if (!err)
-		perf_event_enable(bp);
+	if (!err && modify)
+		err = modify_bp_slot(bp, old_type);
 
 	if (err) {
 		bp->attr.bp_addr = old_addr;
@@ -469,9 +500,10 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 		return err;
 	}
 
-end:
-	bp->attr.disabled = attr->disabled;
+	if (!attr->disabled)
+		perf_event_enable(bp);
 
+	bp->attr.disabled = attr->disabled;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
-- 
cgit v1.2.3-71-gd317


From 18ff57b220610a699947f20b156a8245ca7eee98 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:44 +0100
Subject: hw_breakpoint: Factor out __modify_user_hw_breakpoint() function

Moving out all the functionality without the events
disabling/enabling calls, because we want to call another
disabling/enabling functions in following change.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-5-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 46 +++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 776948beb4ac..a556aba223da 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -456,6 +456,33 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
+static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
+{
+	u64 old_addr = bp->attr.bp_addr;
+	u64 old_len  = bp->attr.bp_len;
+	int old_type = bp->attr.bp_type;
+	bool modify  = attr->bp_type != old_type;
+	int err = 0;
+
+	bp->attr.bp_addr = attr->bp_addr;
+	bp->attr.bp_type = attr->bp_type;
+	bp->attr.bp_len  = attr->bp_len;
+
+	err = validate_hw_breakpoint(bp);
+	if (!err && modify)
+		err = modify_bp_slot(bp, old_type);
+
+	if (err) {
+		bp->attr.bp_addr = old_addr;
+		bp->attr.bp_type = old_type;
+		bp->attr.bp_len  = old_len;
+		return err;
+	}
+
+	bp->attr.disabled = attr->disabled;
+	return 0;
+}
+
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
  * @bp: the breakpoint structure to modify
@@ -465,11 +492,7 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
  */
 int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
 {
-	u64 old_addr = bp->attr.bp_addr;
-	u64 old_len = bp->attr.bp_len;
-	int old_type = bp->attr.bp_type;
-	bool modify = attr->bp_type != old_type;
-	int err = 0;
+	int err;
 
 	/*
 	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
@@ -482,18 +505,9 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	else
 		perf_event_disable(bp);
 
-	bp->attr.bp_addr = attr->bp_addr;
-	bp->attr.bp_type = attr->bp_type;
-	bp->attr.bp_len = attr->bp_len;
-
-	err = validate_hw_breakpoint(bp);
-	if (!err && modify)
-		err = modify_bp_slot(bp, old_type);
+	err = __modify_user_hw_breakpoint(bp, attr);
 
 	if (err) {
-		bp->attr.bp_addr = old_addr;
-		bp->attr.bp_type = old_type;
-		bp->attr.bp_len = old_len;
 		if (!bp->attr.disabled)
 			perf_event_enable(bp);
 
@@ -502,8 +516,6 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 
 	if (!attr->disabled)
 		perf_event_enable(bp);
-
-	bp->attr.disabled = attr->disabled;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
-- 
cgit v1.2.3-71-gd317


From 705feaf321c37e4dca3637fd5cb3b275f17a06c9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:45 +0100
Subject: hw_breakpoint: Add perf_event_attr fields check in
 __modify_user_hw_breakpoint()

And rename it to modify_user_hw_breakpoint_check().

We are about to use modify_user_hw_breakpoint_check() for user space
breakpoints modification, we must be very strict to check only the
fields we can change have changed. As Peter explained:

 "Suppose someone does:

        attr = malloc(sizeof(*attr)); // uninitialized memory
        attr->type = BP;
        attr->bp_addr = new_addr;
        attr->bp_type = bp_type;
        attr->bp_len = bp_len;
        ioctl(fd, PERF_IOC_MOD_ATTR, &attr);

  And feeds absolute shite for the rest of the fields.
  Then we later want to extend IOC_MOD_ATTR to allow changing
  attr::sample_type but we can't, because that would break the
  above application."

I'm making this check optional because we already export
modify_user_hw_breakpoint() and with this check we could
break existing users.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-6-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a556aba223da..0c82663395f7 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -456,7 +456,9 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
-static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
+static int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+			        bool check)
 {
 	u64 old_addr = bp->attr.bp_addr;
 	u64 old_len  = bp->attr.bp_len;
@@ -468,6 +470,9 @@ static int __modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_
 	bp->attr.bp_type = attr->bp_type;
 	bp->attr.bp_len  = attr->bp_len;
 
+	if (check && memcmp(&bp->attr, attr, sizeof(*attr)))
+		return -EINVAL;
+
 	err = validate_hw_breakpoint(bp);
 	if (!err && modify)
 		err = modify_bp_slot(bp, old_type);
@@ -505,7 +510,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	else
 		perf_event_disable(bp);
 
-	err = __modify_user_hw_breakpoint(bp, attr);
+	err = modify_user_hw_breakpoint_check(bp, attr, false);
 
 	if (err) {
 		if (!bp->attr.disabled)
-- 
cgit v1.2.3-71-gd317


From 5f970521d3279d99adcdebf329631e36cb9f0deb Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Mar 2018 14:45:46 +0100
Subject: perf/core: Move perf_event_attr::sample_max_stack into
 perf_copy_attr()

Move the sample_max_stack check and setup into perf_copy_attr(),
so we have all perf_event_attr initial setup in one place
and can easily compare attrs in the new ioctl introduced
in following change.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Milind Chabbi <chabbi.milind@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-7-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 969f865f9f1c..ee145bdee6ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10125,6 +10125,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			ret = -EINVAL;
 	}
 
+	if (!attr->sample_max_stack)
+		attr->sample_max_stack = sysctl_perf_event_max_stack;
+
 	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
 		ret = perf_reg_validate(attr->sample_regs_intr);
 out:
@@ -10338,9 +10341,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	if (!attr.sample_max_stack)
-		attr.sample_max_stack = sysctl_perf_event_max_stack;
-
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
-- 
cgit v1.2.3-71-gd317


From 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573 Mon Sep 17 00:00:00 2001
From: Milind Chabbi <chabbi.milind@gmail.com>
Date: Mon, 12 Mar 2018 14:45:47 +0100
Subject: perf/core: Implement fast breakpoint modification via
 _IOC_MODIFY_ATTRIBUTES

Problem and motivation: Once a breakpoint perf event (PERF_TYPE_BREAKPOINT)
is created, there is no flexibility to change the breakpoint type
(bp_type), breakpoint address (bp_addr), or breakpoint length (bp_len). The
only option is to close the perf event and configure a new breakpoint
event. This inflexibility has a significant performance overhead. For
example, sampling-based, lightweight performance profilers (and also
concurrency bug detection tools),  monitor different addresses for a short
duration using PERF_TYPE_BREAKPOINT and change the address (bp_addr) to
another address or change the kind of breakpoint (bp_type) from  "write" to
a "read" or vice-versa or change the length (bp_len) of the address being
monitored. The cost of these modifications is prohibitive since it involves
unmapping the circular buffer associated with the perf event, closing the
perf event, opening another perf event and mmaping another circular buffer.

Solution: The new ioctl flag for perf events,
PERF_EVENT_IOC_MODIFY_ATTRIBUTES, introduced in this patch takes a pointer
to a struct perf_event_attr as an argument to update an old breakpoint
event with new address, type, and size. This facility allows retaining a
previous mmaped perf events ring buffer and avoids having to close and
reopen another perf event.

This patch supports only changing PERF_TYPE_BREAKPOINT event type; future
implementations can extend this feature. The patch replicates some of its
functionality of modify_user_hw_breakpoint() in
kernel/events/hw_breakpoint.c. modify_user_hw_breakpoint cannot be called
directly since perf_event_ctx_lock() is already held in _perf_ioctl().

Evidence: Experiments show that the baseline (not able to modify an already
created breakpoint) costs an order of magnitude (~10x) more than the
suggested optimization (having the ability to dynamically modifying a
configured breakpoint via ioctl). When the breakpoints typically do not
trap, the speedup due to the suggested optimization is ~10x; even when the
breakpoints always trap, the speedup is ~4x due to the suggested
optimization.

Testing: tests posted at
https://github.com/linux-contrib/perf_event_modify_bp demonstrate the
performance significance of this patch. Tests also check the functional
correctness of the patch.

Signed-off-by: Milind Chabbi <chabbi.milind@gmail.com>
[ Using modify_user_hw_breakpoint_check function. ]
[ Reformated PERF_EVENT_IOC_*, so the values are all in one column. ]
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <onestero@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180312134548.31532-8-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/hw_breakpoint.h         |  7 +++++
 include/uapi/linux/perf_event.h       | 23 +++++++++--------
 kernel/events/core.c                  | 48 +++++++++++++++++++++++++++++++++++
 kernel/events/hw_breakpoint.c         |  2 +-
 tools/include/uapi/linux/perf_event.h | 23 +++++++++--------
 5 files changed, 80 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index cf045885a499..6058c3844a76 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -53,6 +53,9 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 /* FIXME: only change from the attr, and don't unregister */
 extern int
 modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr);
+extern int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+				bool check);
 
 /*
  * Kernel breakpoints are not associated with any particular thread.
@@ -97,6 +100,10 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 static inline int
 modify_user_hw_breakpoint(struct perf_event *bp,
 			  struct perf_event_attr *attr)	{ return -ENOSYS; }
+static inline int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+				bool check)	{ return -ENOSYS; }
+
 static inline struct perf_event *
 register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_overflow_handler_t	 triggered,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 6f873503552d..912b85b52344 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -448,17 +448,18 @@ struct perf_event_query_bpf {
 /*
  * Ioctls that can be done on a perf event fd:
  */
-#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
-#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
-#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
-#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
-#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
-#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
-#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
-#define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
-#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
-#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
-#define PERF_EVENT_IOC_QUERY_BPF	_IOWR('$', 10, struct perf_event_query_bpf *)
+#define PERF_EVENT_IOC_ENABLE			_IO ('$', 0)
+#define PERF_EVENT_IOC_DISABLE			_IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH			_IO ('$', 2)
+#define PERF_EVENT_IOC_RESET			_IO ('$', 3)
+#define PERF_EVENT_IOC_PERIOD			_IOW('$', 4, __u64)
+#define PERF_EVENT_IOC_SET_OUTPUT		_IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER		_IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ID			_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF			_IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT		_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF		_IOWR('$', 10, struct perf_event_query_bpf *)
+#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES	_IOW('$', 11, struct perf_event_attr *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ee145bdee6ed..3b4c7792a6ac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2846,6 +2846,41 @@ int perf_event_refresh(struct perf_event *event, int refresh)
 }
 EXPORT_SYMBOL_GPL(perf_event_refresh);
 
+static int perf_event_modify_breakpoint(struct perf_event *bp,
+					 struct perf_event_attr *attr)
+{
+	int err;
+
+	_perf_event_disable(bp);
+
+	err = modify_user_hw_breakpoint_check(bp, attr, true);
+	if (err) {
+		if (!bp->attr.disabled)
+			_perf_event_enable(bp);
+
+		return err;
+	}
+
+	if (!attr->disabled)
+		_perf_event_enable(bp);
+	return 0;
+}
+
+static int perf_event_modify_attr(struct perf_event *event,
+				  struct perf_event_attr *attr)
+{
+	if (event->attr.type != attr->type)
+		return -EINVAL;
+
+	switch (event->attr.type) {
+	case PERF_TYPE_BREAKPOINT:
+		return perf_event_modify_breakpoint(event, attr);
+	default:
+		/* Place holder for future additions. */
+		return -EOPNOTSUPP;
+	}
+}
+
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
@@ -4952,6 +4987,8 @@ static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+			  struct perf_event_attr *attr);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -5024,6 +5061,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 
 	case PERF_EVENT_IOC_QUERY_BPF:
 		return perf_event_query_prog_array(event, (void __user *)arg);
+
+	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
+		struct perf_event_attr new_attr;
+		int err = perf_copy_attr((struct perf_event_attr __user *)arg,
+					 &new_attr);
+
+		if (err)
+			return err;
+
+		return perf_event_modify_attr(event,  &new_attr);
+	}
 	default:
 		return -ENOTTY;
 	}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 0c82663395f7..6253d5519cd8 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -456,7 +456,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
-static int
+int
 modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
 			        bool check)
 {
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 6f873503552d..912b85b52344 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -448,17 +448,18 @@ struct perf_event_query_bpf {
 /*
  * Ioctls that can be done on a perf event fd:
  */
-#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
-#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
-#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
-#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
-#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
-#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
-#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
-#define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
-#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
-#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
-#define PERF_EVENT_IOC_QUERY_BPF	_IOWR('$', 10, struct perf_event_query_bpf *)
+#define PERF_EVENT_IOC_ENABLE			_IO ('$', 0)
+#define PERF_EVENT_IOC_DISABLE			_IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH			_IO ('$', 2)
+#define PERF_EVENT_IOC_RESET			_IO ('$', 3)
+#define PERF_EVENT_IOC_PERIOD			_IOW('$', 4, __u64)
+#define PERF_EVENT_IOC_SET_OUTPUT		_IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER		_IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ID			_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF			_IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT		_IOW('$', 9, __u32)
+#define PERF_EVENT_IOC_QUERY_BPF		_IOWR('$', 10, struct perf_event_query_bpf *)
+#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES	_IOW('$', 11, struct perf_event_attr *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
-- 
cgit v1.2.3-71-gd317


From 537f4146c53c95aac977852b371bafb9c6755ee1 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Tue, 6 Mar 2018 15:35:43 +0530
Subject: workqueue: use put_device() instead of kfree()

Never directly free @dev after calling device_register(), even
if it returned an error! Always use put_device() to give up the
reference initialized in this function instead.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bb9a519cbf50..ccd1080dd6e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5337,7 +5337,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
 
 	ret = device_register(&wq_dev->dev);
 	if (ret) {
-		kfree(wq_dev);
+		put_device(&wq_dev->dev);
 		wq->wq_dev = NULL;
 		return ret;
 	}
-- 
cgit v1.2.3-71-gd317


From 6417250d3f894e66a68ba1cd93676143f2376a6f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 6 Mar 2018 19:34:42 -0800
Subject: workqueue: remove unused cancel_work()

Found this by accident.
There are no usages of bare cancel_work() in current kernel source.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 1 -
 kernel/workqueue.c        | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bc0cda180c8b..0c3301421c57 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -456,7 +456,6 @@ extern int schedule_on_each_cpu(work_func_t func);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern bool flush_work(struct work_struct *work);
-extern bool cancel_work(struct work_struct *work);
 extern bool cancel_work_sync(struct work_struct *work);
 
 extern bool flush_delayed_work(struct delayed_work *dwork);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccd1080dd6e7..6ec6ba65127b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
 	return ret;
 }
 
-/*
- * See cancel_delayed_work()
- */
-bool cancel_work(struct work_struct *work)
-{
-	return __cancel_work(work, false);
-}
-
 /**
  * cancel_delayed_work - cancel a delayed work
  * @dwork: delayed_work to cancel
-- 
cgit v1.2.3-71-gd317


From af1d830bf32b27b387b97c8b29dc09e306a9ff7f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 14 Mar 2018 10:24:20 -0500
Subject: jump_label: Fix sparc64 warning

The kbuild test robot reported the following warning on sparc64:

  kernel/jump_label.c: In function '__jump_label_update':
  kernel/jump_label.c:376:51: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
       WARN_ONCE(1, "can't patch jump_label at %pS", (void *)entry->code);

On sparc64, the jump_label entry->code field is of type u32, but
pointers are 64-bit.  Silence the warning by casting entry->code to an
unsigned long before casting it to a pointer.  This is also what the
sparc jump label code does.

Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: "David S . Miller" <davem@davemloft.net>
Link: https://lkml.kernel.org/r/c966fed42be6611254a62d46579ec7416548d572.1521041026.git.jpoimboe@redhat.com
---
 kernel/jump_label.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52a0a7af8640..e7214093dcd1 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -373,7 +373,8 @@ static void __jump_label_update(struct static_key *key,
 			if (kernel_text_address(entry->code))
 				arch_jump_label_transform(entry, jump_label_type(entry));
 			else
-				WARN_ONCE(1, "can't patch jump_label at %pS", (void *)entry->code);
+				WARN_ONCE(1, "can't patch jump_label at %pS",
+					  (void *)(unsigned long)entry->code);
 		}
 	}
 }
-- 
cgit v1.2.3-71-gd317


From 17a2f1ced0280068897990b0dd25ce70555b8ac7 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Fri, 1 Dec 2017 21:50:05 +0800
Subject: cpu/hotplug: Merge cpuhp_bp_states and cpuhp_ap_states

cpuhp_bp_states and cpuhp_ap_states have different set of steps without any
conflicting steps, so that they can be merged.

The original `[CPUHP_BRINGUP_CPU] = { },` is removed, because the new
cpuhp_hp_states has CPUHP_ONLINE index which is larger than
CPUHP_BRINGUP_CPU.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20171201135008.21633-1-jiangshanlai@gmail.com
---
 kernel/cpu.c | 42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53f7dc65f9a3..a1860d42aacf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -124,8 +124,7 @@ struct cpuhp_step {
 };
 
 static DEFINE_MUTEX(cpuhp_state_mutex);
-static struct cpuhp_step cpuhp_bp_states[];
-static struct cpuhp_step cpuhp_ap_states[];
+static struct cpuhp_step cpuhp_hp_states[];
 
 static bool cpuhp_is_ap_state(enum cpuhp_state state)
 {
@@ -138,10 +137,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state)
 
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 {
-	struct cpuhp_step *sp;
-
-	sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
-	return sp + state;
+	return cpuhp_hp_states + state;
 }
 
 /**
@@ -1224,7 +1220,7 @@ int __boot_cpu_id;
 #endif /* CONFIG_SMP */
 
 /* Boot processor state steps */
-static struct cpuhp_step cpuhp_bp_states[] = {
+static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_OFFLINE] = {
 		.name			= "offline",
 		.startup.single		= NULL,
@@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown.single	= NULL,
 		.cant_stop		= true,
 	},
-	/*
-	 * Handled on controll processor until the plugged processor manages
-	 * this itself.
-	 */
-	[CPUHP_TEARDOWN_CPU] = {
-		.name			= "cpu:teardown",
-		.startup.single		= NULL,
-		.teardown.single	= takedown_cpu,
-		.cant_stop		= true,
-	},
-#else
-	[CPUHP_BRINGUP_CPU] = { },
-#endif
-};
-
-/* Application processor state steps */
-static struct cpuhp_step cpuhp_ap_states[] = {
-#ifdef CONFIG_SMP
 	/* Final state before CPU kills itself */
 	[CPUHP_AP_IDLE_DEAD] = {
 		.name			= "idle:dead",
@@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	[CPUHP_AP_ONLINE] = {
 		.name			= "ap:online",
 	},
+	/*
+	 * Handled on controll processor until the plugged processor manages
+	 * this itself.
+	 */
+	[CPUHP_TEARDOWN_CPU] = {
+		.name			= "cpu:teardown",
+		.startup.single		= NULL,
+		.teardown.single	= takedown_cpu,
+		.cant_stop		= true,
+	},
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",
@@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
 
 	switch (state) {
 	case CPUHP_AP_ONLINE_DYN:
-		step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+		step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
 		end = CPUHP_AP_ONLINE_DYN_END;
 		break;
 	case CPUHP_BP_PREPARE_DYN:
-		step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+		step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
 		end = CPUHP_BP_PREPARE_DYN_END;
 		break;
 	default:
-- 
cgit v1.2.3-71-gd317


From fcb3029a8d89487470f2e175645a9a993949233c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 15 Mar 2018 16:38:04 +0100
Subject: cpu/hotplug: Fix unused function warning

The cpuhp_is_ap_state() function is no longer called outside of the
CONFIG_SMP #ifdef section, causing a harmless warning:

kernel/cpu.c:129:13: error: 'cpuhp_is_ap_state' defined but not used [-Werror=unused-function]

This moves the function into the #ifdef to get a clean build again.

Fixes: 17a2f1ced028 ("cpu/hotplug: Merge cpuhp_bp_states and cpuhp_ap_states")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Link: https://lkml.kernel.org/r/20180315153829.3819606-1-arnd@arndb.de
---
 kernel/cpu.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a1860d42aacf..0db8938fbb23 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,15 +126,6 @@ struct cpuhp_step {
 static DEFINE_MUTEX(cpuhp_state_mutex);
 static struct cpuhp_step cpuhp_hp_states[];
 
-static bool cpuhp_is_ap_state(enum cpuhp_state state)
-{
-	/*
-	 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
-	 * purposes as that state is handled explicitly in cpu_down.
-	 */
-	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
-}
-
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 {
 	return cpuhp_hp_states + state;
@@ -235,6 +226,15 @@ err:
 }
 
 #ifdef CONFIG_SMP
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+	/*
+	 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+	 * purposes as that state is handled explicitly in cpu_down.
+	 */
+	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
 static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
 {
 	struct completion *done = bringup ? &st->done_up : &st->done_down;
-- 
cgit v1.2.3-71-gd317


From 1a8429132e1d2ada3832db5b4a0802c49affb750 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 9 Mar 2018 23:11:26 +0100
Subject: mm: remove blackfin MPU support

The CONFIG_MPU option was only defined on blackfin, and that architecture
is now being removed, so the respective code can be simplified.

A lot of other microcontrollers have an MPU, but I suspect that if we
want to bring that support back, we'd do it differently anyway.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 kernel/module.c |  4 ----
 mm/nommu.c      | 20 --------------------
 2 files changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ad2d420024f6..2c1df850029b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2181,10 +2181,6 @@ static void free_module(struct module *mod)
 	/* Finally, free the core (containing the module structure) */
 	disable_ro_nx(&mod->core_layout);
 	module_memfree(mod->core_layout.base);
-
-#ifdef CONFIG_MPU
-	update_protections(current->mm);
-#endif
 }
 
 void *__symbol_get(const char *symbol)
diff --git a/mm/nommu.c b/mm/nommu.c
index ebb6e618dade..838a8fdec5c2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -662,22 +662,6 @@ static void put_nommu_region(struct vm_region *region)
 	__put_nommu_region(region);
 }
 
-/*
- * update protection on a vma
- */
-static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
-{
-#ifdef CONFIG_MPU
-	struct mm_struct *mm = vma->vm_mm;
-	long start = vma->vm_start & PAGE_MASK;
-	while (start < vma->vm_end) {
-		protect_page(mm, start, flags);
-		start += PAGE_SIZE;
-	}
-	update_protections(mm);
-#endif
-}
-
 /*
  * add a VMA into a process's mm_struct in the appropriate place in the list
  * and tree and add to the address space's page tree also if not an anonymous
@@ -695,8 +679,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 	mm->map_count++;
 	vma->vm_mm = mm;
 
-	protect_vma(vma, vma->vm_flags);
-
 	/* add the VMA to the mapping */
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
@@ -757,8 +739,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 	struct mm_struct *mm = vma->vm_mm;
 	struct task_struct *curr = current;
 
-	protect_vma(vma, 0);
-
 	mm->map_count--;
 	for (i = 0; i < VMACACHE_SIZE; i++) {
 		/* if the vma is cached, invalidate the entire cache */
-- 
cgit v1.2.3-71-gd317


From edb39592a5877bd91b2e6ee15194268f35b04892 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Mar 2018 17:36:56 +0100
Subject: perf: Fix sibling iteration

Mark noticed that the change to sibling_list changed some iteration
semantics; because previously we used group_list as list entry,
sibling events would always have an empty sibling_list.

But because we now use sibling_list for both list head and list entry,
siblings will report as having siblings.

Fix this with a custom for_each_sibling_event() iterator.

Fixes: 8343aae66167 ("perf/core: Remove perf_event::group_entry")
Reported-by: Mark Rutland <mark.rutland@arm.com>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: vincent.weaver@maine.edu
Cc: alexander.shishkin@linux.intel.com
Cc: torvalds@linux-foundation.org
Cc: alexey.budankov@linux.intel.com
Cc: valery.cherepennikov@intel.com
Cc: eranian@google.com
Cc: acme@redhat.com
Cc: linux-tip-commits@vger.kernel.org
Cc: davidcc@google.com
Cc: kan.liang@intel.com
Cc: Dmitry.Prohorov@intel.com
Cc: jolsa@redhat.com
Link: https://lkml.kernel.org/r/20180315170129.GX4043@hirez.programming.kicks-ass.net
---
 arch/alpha/kernel/perf_event.c           |  2 +-
 arch/arm/mach-imx/mmdc.c                 |  2 +-
 arch/arm/mm/cache-l2x0-pmu.c             |  2 +-
 arch/mips/kernel/perf_event_mipsxx.c     |  2 +-
 arch/powerpc/perf/core-book3s.c          |  2 +-
 arch/powerpc/perf/core-fsl-emb.c         |  2 +-
 arch/sparc/kernel/perf_event.c           |  2 +-
 arch/x86/events/core.c                   |  2 +-
 arch/x86/events/intel/uncore.c           |  2 +-
 drivers/bus/arm-cci.c                    |  2 +-
 drivers/bus/arm-ccn.c                    |  4 ++--
 drivers/perf/arm_dsu_pmu.c               |  2 +-
 drivers/perf/arm_pmu.c                   |  2 +-
 drivers/perf/hisilicon/hisi_uncore_pmu.c |  2 +-
 drivers/perf/qcom_l2_pmu.c               |  7 +++----
 drivers/perf/qcom_l3_pmu.c               |  2 +-
 drivers/perf/xgene_pmu.c                 |  4 ++--
 include/linux/perf_event.h               |  4 ++++
 kernel/events/core.c                     | 34 +++++++++++++++-----------------
 19 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 435864c24479..5613aa378a83 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -351,7 +351,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		evtype[n] = group->hw.event_base;
 		current_idx[n++] = PMC_NO_INDEX;
 	}
-	list_for_each_entry(pe, &group->sibling_list, sibling_list) {
+	for_each_sibling_event(pe, group) {
 		if (!is_software_event(pe) && pe->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
 				return -1;
diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c
index 27a9ca20933e..04b3bf71de94 100644
--- a/arch/arm/mach-imx/mmdc.c
+++ b/arch/arm/mach-imx/mmdc.c
@@ -269,7 +269,7 @@ static bool mmdc_pmu_group_is_valid(struct perf_event *event)
 			return false;
 	}
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (!mmdc_pmu_group_event_is_valid(sibling, pmu, &counter_mask))
 			return false;
 	}
diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c
index 3a89ea4c2b57..afe5b4c7b164 100644
--- a/arch/arm/mm/cache-l2x0-pmu.c
+++ b/arch/arm/mm/cache-l2x0-pmu.c
@@ -293,7 +293,7 @@ static bool l2x0_pmu_group_is_valid(struct perf_event *event)
 	else if (!is_software_event(leader))
 		return false;
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (sibling->pmu == pmu)
 			num_hw++;
 		else if (!is_software_event(sibling))
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 46097ff3208b..ee73550f0b9a 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -711,7 +711,7 @@ static int validate_group(struct perf_event *event)
 	if (mipsxx_pmu_alloc_counter(&fake_cpuc, &leader->hw) < 0)
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (mipsxx_pmu_alloc_counter(&fake_cpuc, &sibling->hw) < 0)
 			return -EINVAL;
 	}
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 7c1f66050433..f8908ea4ea73 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1426,7 +1426,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		flags[n] = group->hw.event_base;
 		events[n++] = group->hw.config;
 	}
-	list_for_each_entry(event, &group->sibling_list, sibling_list) {
+	for_each_sibling_event(event, group) {
 		if (event->pmu->task_ctx_nr == perf_hw_context &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index 94c2e63662c6..85f1d18e5fd3 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -277,7 +277,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		ctrs[n] = group;
 		n++;
 	}
-	list_for_each_entry(event, &group->sibling_list, sibling_list) {
+	for_each_sibling_event(event, group) {
 		if (!is_software_event(event) &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index a0a86d369119..d3149baaa33c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1342,7 +1342,7 @@ static int collect_events(struct perf_event *group, int max_count,
 		events[n] = group->hw.event_base;
 		current_idx[n++] = PIC_NO_INDEX;
 	}
-	list_for_each_entry(event, &group->sibling_list, sibling_list) {
+	for_each_sibling_event(event, group) {
 		if (!is_software_event(event) &&
 		    event->state != PERF_EVENT_STATE_OFF) {
 			if (n >= max_count)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 77a4125b6b1f..bfc8f43909c1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -990,7 +990,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 	if (!dogrp)
 		return n;
 
-	list_for_each_entry(event, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(event, leader) {
 		if (!is_x86_event(event) ||
 		    event->state <= PERF_EVENT_STATE_OFF)
 			continue;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 9e374cd22ad2..a7956fc7ca1d 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -354,7 +354,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
 	if (!dogrp)
 		return n;
 
-	list_for_each_entry(event, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(event, leader) {
 		if (!is_box_event(box, event) ||
 		    event->state <= PERF_EVENT_STATE_OFF)
 			continue;
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index c98435bdb64f..c4c0c8560cce 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -1311,7 +1311,7 @@ validate_group(struct perf_event *event)
 	if (!validate_event(event->pmu, &fake_pmu, leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (!validate_event(event->pmu, &fake_pmu, sibling))
 			return -EINVAL;
 	}
diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c
index 1c310a4be000..65b7e4042ece 100644
--- a/drivers/bus/arm-ccn.c
+++ b/drivers/bus/arm-ccn.c
@@ -846,11 +846,11 @@ static int arm_ccn_pmu_event_init(struct perf_event *event)
 			!is_software_event(event->group_leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			sibling_list)
+	for_each_sibling_event(sibling, event->group_leader) {
 		if (sibling->pmu != event->pmu &&
 				!is_software_event(sibling))
 			return -EINVAL;
+	}
 
 	return 0;
 }
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index 660680d78147..660cb8ac886a 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -536,7 +536,7 @@ static bool dsu_pmu_validate_group(struct perf_event *event)
 	memset(fake_hw.used_mask, 0, sizeof(fake_hw.used_mask));
 	if (!dsu_pmu_validate_event(event->pmu, &fake_hw, leader))
 		return false;
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (!dsu_pmu_validate_event(event->pmu, &fake_hw, sibling))
 			return false;
 	}
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 628d7a7b9526..344e2083e941 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -311,7 +311,7 @@ validate_group(struct perf_event *event)
 	if (!validate_event(event->pmu, &fake_pmu, leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (!validate_event(event->pmu, &fake_pmu, sibling))
 			return -EINVAL;
 	}
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index e3356087fd76..44df61397a38 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -82,7 +82,7 @@ static bool hisi_validate_event_group(struct perf_event *event)
 			counters++;
 	}
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, event->group_leader) {
 		if (is_software_event(sibling))
 			continue;
 		if (sibling->pmu != event->pmu)
diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
index 5e535a718965..842135cf35a3 100644
--- a/drivers/perf/qcom_l2_pmu.c
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -534,14 +534,14 @@ static int l2_cache_event_init(struct perf_event *event)
 		return -EINVAL;
 	}
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			    sibling_list)
+	for_each_sibling_event(sibling, event->group_leader) {
 		if (sibling->pmu != event->pmu &&
 		    !is_software_event(sibling)) {
 			dev_dbg_ratelimited(&l2cache_pmu->pdev->dev,
 				 "Can't create mixed PMU group\n");
 			return -EINVAL;
 		}
+	}
 
 	cluster = get_cluster_pmu(l2cache_pmu, event->cpu);
 	if (!cluster) {
@@ -571,8 +571,7 @@ static int l2_cache_event_init(struct perf_event *event)
 		return -EINVAL;
 	}
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			    sibling_list) {
+	for_each_sibling_event(sibling, event->group_leader) {
 		if ((sibling != event) &&
 		    !is_software_event(sibling) &&
 		    (L2_EVT_GROUP(sibling->attr.config) ==
diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c
index 5dedf4b1a552..2dc63d61f2ea 100644
--- a/drivers/perf/qcom_l3_pmu.c
+++ b/drivers/perf/qcom_l3_pmu.c
@@ -468,7 +468,7 @@ static bool qcom_l3_cache__validate_event_group(struct perf_event *event)
 	counters = event_num_counters(event);
 	counters += event_num_counters(leader);
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sibling, leader) {
 		if (is_software_event(sibling))
 			continue;
 		if (sibling->pmu != event->pmu)
diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c
index f1f4a56cab5e..6bdb1dad805f 100644
--- a/drivers/perf/xgene_pmu.c
+++ b/drivers/perf/xgene_pmu.c
@@ -949,11 +949,11 @@ static int xgene_perf_event_init(struct perf_event *event)
 			!is_software_event(event->group_leader))
 		return -EINVAL;
 
-	list_for_each_entry(sibling, &event->group_leader->sibling_list,
-			sibling_list)
+	for_each_sibling_event(sibling, event->group_leader) {
 		if (sibling->pmu != event->pmu &&
 				!is_software_event(sibling))
 			return -EINVAL;
+	}
 
 	return 0;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2bb200e1bbea..ff39ab011376 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -536,6 +536,10 @@ struct pmu_event_list {
 	struct list_head	list;
 };
 
+#define for_each_sibling_event(sibling, event)			\
+	if ((event)->group_leader == (event))			\
+		list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)
+
 /**
  * struct perf_event - performance event kernel representation:
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3b4c7792a6ac..4d7a460d6669 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader)
 {
 	struct perf_event *sibling;
 
-	list_for_each_entry(sibling, &leader->sibling_list, sibling_list)
+	for_each_sibling_event(sibling, leader)
 		perf_event_update_time(sibling);
 }
 
@@ -1828,7 +1828,7 @@ static void perf_group_attach(struct perf_event *event)
 
 	perf_event__header_size(group_leader);
 
-	list_for_each_entry(pos, &group_leader->sibling_list, sibling_list)
+	for_each_sibling_event(pos, group_leader)
 		perf_event__header_size(pos);
 }
 
@@ -1928,7 +1928,7 @@ static void perf_group_detach(struct perf_event *event)
 out:
 	perf_event__header_size(event->group_leader);
 
-	list_for_each_entry(tmp, &event->group_leader->sibling_list, sibling_list)
+	for_each_sibling_event(tmp, event->group_leader)
 		perf_event__header_size(tmp);
 }
 
@@ -1951,13 +1951,13 @@ static inline int __pmu_filter_match(struct perf_event *event)
  */
 static inline int pmu_filter_match(struct perf_event *event)
 {
-	struct perf_event *child;
+	struct perf_event *sibling;
 
 	if (!__pmu_filter_match(event))
 		return 0;
 
-	list_for_each_entry(child, &event->sibling_list, sibling_list) {
-		if (!__pmu_filter_match(child))
+	for_each_sibling_event(sibling, event) {
+		if (!__pmu_filter_match(sibling))
 			return 0;
 	}
 
@@ -2031,7 +2031,7 @@ group_sched_out(struct perf_event *group_event,
 	/*
 	 * Schedule out siblings (if any):
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, sibling_list)
+	for_each_sibling_event(event, group_event)
 		event_sched_out(event, cpuctx, ctx);
 
 	perf_pmu_enable(ctx->pmu);
@@ -2310,7 +2310,7 @@ group_sched_in(struct perf_event *group_event,
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, sibling_list) {
+	for_each_sibling_event(event, group_event) {
 		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
@@ -2326,7 +2326,7 @@ group_error:
 	 * partial group before returning:
 	 * The events up to the failed event are scheduled out normally.
 	 */
-	list_for_each_entry(event, &group_event->sibling_list, sibling_list) {
+	for_each_sibling_event(event, group_event) {
 		if (event == partial_group)
 			break;
 
@@ -3863,7 +3863,7 @@ static void __perf_event_read(void *info)
 
 	pmu->read(event);
 
-	list_for_each_entry(sub, &event->sibling_list, sibling_list) {
+	for_each_sibling_event(sub, event) {
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 			/*
 			 * Use sibling's PMU rather than @event's since
@@ -4711,7 +4711,7 @@ static int __perf_read_group_add(struct perf_event *leader,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
-	list_for_each_entry(sub, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sub, leader) {
 		values[n++] += perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
@@ -4905,7 +4905,7 @@ static void perf_event_for_each(struct perf_event *event,
 	event = event->group_leader;
 
 	perf_event_for_each_child(event, func);
-	list_for_each_entry(sibling, &event->sibling_list, sibling_list)
+	for_each_sibling_event(sibling, event)
 		perf_event_for_each_child(sibling, func);
 }
 
@@ -6077,7 +6077,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 	__output_copy(handle, values, n * sizeof(u64));
 
-	list_for_each_entry(sub, &leader->sibling_list, sibling_list) {
+	for_each_sibling_event(sub, leader) {
 		n = 0;
 
 		if ((sub != event) &&
@@ -10662,8 +10662,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		perf_remove_from_context(group_leader, 0);
 		put_ctx(gctx);
 
-		list_for_each_entry(sibling, &group_leader->sibling_list,
-				    sibling_list) {
+		for_each_sibling_event(sibling, group_leader) {
 			perf_remove_from_context(sibling, 0);
 			put_ctx(gctx);
 		}
@@ -10684,8 +10683,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * By installing siblings first we NO-OP because they're not
 		 * reachable through the group lists.
 		 */
-		list_for_each_entry(sibling, &group_leader->sibling_list,
-				    sibling_list) {
+		for_each_sibling_event(sibling, group_leader) {
 			perf_event__state_init(sibling);
 			perf_install_in_context(ctx, sibling, sibling->cpu);
 			get_ctx(ctx);
@@ -11324,7 +11322,7 @@ static int inherit_group(struct perf_event *parent_event,
 	 * case inherit_event() will create individual events, similar to what
 	 * perf_group_detach() would do anyway.
 	 */
-	list_for_each_entry(sub, &parent_event->sibling_list, sibling_list) {
+	for_each_sibling_event(sub, parent_event) {
 		child_ctr = inherit_event(sub, parent, parent_ctx,
 					    child, leader, child_ctx);
 		if (IS_ERR(child_ctr))
-- 
cgit v1.2.3-71-gd317


From 24868367cdcac447232ebcb2aa06e1bf91291586 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 16 Mar 2018 12:51:40 +0000
Subject: perf/core: Clear sibling list of detached events

When perf_group_dettach() is called on a group leader, it updates each
sibling's group_leader field to point to that sibling, effectively
upgrading each siblnig to a group leader. After perf_group_detach has
completed, the caller may free the leader event.

We only remove siblings from the group leader's sibling_list when the
leader has a non-empty group_node. This was fine prior to commit:

  8343aae66167df67 ("perf/core: Remove perf_event::group_entry")

... as the sibling's sibling_list would be empty. However, now that we
use the sibling_list field as both the list head and the list entry,
this leaves each sibling with a non-empty sibling list, including the
stale leader event.

If perf_group_detach() is subsequently called on a sibling, it will
appear to be a group leader, and we'll walk the sibling_list,
potentially dereferencing these stale events. In 0day testing, this has
been observed to result in kernel panics.

Let's avoid this by always removing siblings from the sibling list when
we promote them to leaders.

Fixes: 8343aae66167df67 ("perf/core: Remove perf_event::group_entry")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: vincent.weaver@maine.edu
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: torvalds@linux-foundation.org
Cc: Alexey Budankov <alexey.budankov@linux.intel.com>
Cc: valery.cherepennikov@intel.com
Cc: linux-tip-commits@vger.kernel.org
Cc: eranian@google.com
Cc: acme@redhat.com
Cc: alexander.shishkin@linux.intel.com
Cc: davidcc@google.com
Cc: kan.liang@intel.com
Cc: Dmitry.Prohorov@intel.com
Cc: Jiri Olsa <jolsa@redhat.com>
Link: https://lkml.kernel.org/r/20180316131741.3svgr64yibc6vsid@lakrids.cambridge.arm.com
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d7a460d6669..2776a660db15 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1906,12 +1906,12 @@ static void perf_group_detach(struct perf_event *event)
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 
 		sibling->group_leader = sibling;
+		list_del_init(&sibling->sibling_list);
 
 		/* Inherit group flags from the previous leader */
 		sibling->group_caps = event->group_caps;
 
 		if (!RB_EMPTY_NODE(&event->group_node)) {
-			list_del_init(&sibling->sibling_list);
 			add_event_to_groups(sibling, event->ctx);
 
 			if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
-- 
cgit v1.2.3-71-gd317


From 45dbac0e288350f9a4226a5b4b651ed434dd9f85 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Thu, 15 Mar 2018 04:58:12 -0700
Subject: locking/mutex: Improve documentation

On Wed, Mar 14, 2018 at 01:56:31PM -0700, Andrew Morton wrote:

> My memory is weak and our documentation is awful.  What does
> mutex_lock_killable() actually do and how does it differ from
> mutex_lock_interruptible()?

Add kernel-doc for mutex_lock_killable() and mutex_lock_io().  Reword the
kernel-doc for mutex_lock_interruptible().

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: cl@linux.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20180315115812.GA9949@bombadil.infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 858a07590e39..2048359f33d2 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1082,15 +1082,16 @@ static noinline int __sched
 __mutex_lock_interruptible_slowpath(struct mutex *lock);
 
 /**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
  *
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
+ * Lock the mutex like mutex_lock().  If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
  *
- * This function is similar to (but not equivalent to) down_interruptible().
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
  */
 int __sched mutex_lock_interruptible(struct mutex *lock)
 {
@@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
 
 EXPORT_SYMBOL(mutex_lock_interruptible);
 
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock().  If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
 int __sched mutex_lock_killable(struct mutex *lock)
 {
 	might_sleep();
@@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock().  While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
 void __sched mutex_lock_io(struct mutex *lock)
 {
 	int token;
-- 
cgit v1.2.3-71-gd317


From 7f65ea42eb00bc902f1c37a71e984e4f4064cfa9 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 9 Mar 2018 09:52:42 +0000
Subject: sched/fair: Add util_est on top of PELT

The util_avg signal computed by PELT is too variable for some use-cases.
For example, a big task waking up after a long sleep period will have its
utilization almost completely decayed. This introduces some latency before
schedutil will be able to pick the best frequency to run a task.

The same issue can affect task placement. Indeed, since the task
utilization is already decayed at wakeup, when the task is enqueued in a
CPU, this can result in a CPU running a big task as being temporarily
represented as being almost empty. This leads to a race condition where
other tasks can be potentially allocated on a CPU which just started to run
a big task which slept for a relatively long period.

Moreover, the PELT utilization of a task can be updated every [ms], thus
making it a continuously changing value for certain longer running
tasks. This means that the instantaneous PELT utilization of a RUNNING
task is not really meaningful to properly support scheduler decisions.

For all these reasons, a more stable signal can do a better job of
representing the expected/estimated utilization of a task/cfs_rq.
Such a signal can be easily created on top of PELT by still using it as
an estimator which produces values to be aggregated on meaningful
events.

This patch adds a simple implementation of util_est, a new signal built on
top of PELT's util_avg where:

    util_est(task) = max(task::util_avg, f(task::util_avg@dequeue))

This allows to remember how big a task has been reported by PELT in its
previous activations via f(task::util_avg@dequeue), which is the new
_task_util_est(struct task_struct*) function added by this patch.

If a task should change its behavior and it runs longer in a new
activation, after a certain time its util_est will just track the
original PELT signal (i.e. task::util_avg).

The estimated utilization of cfs_rq is defined only for root ones.
That's because the only sensible consumer of this signal are the
scheduler and schedutil when looking for the overall CPU utilization
due to FAIR tasks.

For this reason, the estimated utilization of a root cfs_rq is simply
defined as:

    util_est(cfs_rq) = max(cfs_rq::util_avg, cfs_rq::util_est::enqueued)

where:

    cfs_rq::util_est::enqueued = sum(_task_util_est(task))
                                 for each RUNNABLE task on that root cfs_rq

It's worth noting that the estimated utilization is tracked only for
objects of interests, specifically:

 - Tasks: to better support tasks placement decisions
 - root cfs_rqs: to better support both tasks placement decisions as
                 well as frequencies selection

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  29 ++++++++++++
 kernel/sched/debug.c    |   4 ++
 kernel/sched/fair.c     | 122 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/features.h |   5 ++
 4 files changed, 154 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21b1168da951..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,6 +274,34 @@ struct load_weight {
 	u32				inv_weight;
 };
 
+/**
+ * struct util_est - Estimation utilization of FAIR tasks
+ * @enqueued: instantaneous estimated utilization of a task/cpu
+ * @ewma:     the Exponential Weighted Moving Average (EWMA)
+ *            utilization of a task
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. Sample's weight is chosen
+ * so that the EWMA will be relatively insensitive to transient changes to the
+ * task's workload.
+ *
+ * The enqueued attribute has a slightly different meaning for tasks and cpus:
+ * - task:   the task's util_avg at last task dequeue time
+ * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
+ * Thus, the util_est.enqueued of a task represents the contribution on the
+ * estimated utilization of the CPU where that task is currently enqueued.
+ *
+ * Only for tasks we track a moving average of the past instantaneous
+ * estimated utilization. This allows to absorb sporadic drops in utilization
+ * of an otherwise almost periodic task.
+ */
+struct util_est {
+	unsigned int			enqueued;
+	unsigned int			ewma;
+#define UTIL_EST_WEIGHT_SHIFT		2
+};
+
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
  * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_load_avg;
 	unsigned long			util_avg;
+	struct util_est			util_est;
 };
 
 struct sched_statistics {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 644d9a464380..332303be4beb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
+	SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
+			cfs_rq->avg.util_est.enqueued);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_load_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
+	P(se.avg.util_est.ewma);
+	P(se.avg.util_est.enqueued);
 #endif
 	P(policy);
 	P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3582117e1580..22b59a7facd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+	struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+	return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+	return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+				    struct task_struct *p)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Update root cfs_rq's estimated utilization */
+	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued += _task_util_est(p);
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + maring < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+	long last_ewma_diff;
+	struct util_est ue;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/*
+	 * Update root cfs_rq's estimated utilization
+	 *
+	 * If *p is the last task then the root cfs_rq's estimated utilization
+	 * of a CPU is 0 by definition.
+	 */
+	ue.enqueued = 0;
+	if (cfs_rq->nr_running) {
+		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+		ue.enqueued -= min_t(unsigned int, ue.enqueued,
+				     _task_util_est(p));
+	}
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+	/*
+	 * Skip update of task's estimated utilization when the task has not
+	 * yet completed an activation, e.g. being migrated.
+	 */
+	if (!task_sleep)
+		return;
+
+	/*
+	 * Skip update of task's estimated utilization when its EWMA is
+	 * already ~1% close to its last activation value.
+	 */
+	ue = p->se.avg.util_est;
+	ue.enqueued = task_util(p);
+	last_ewma_diff = ue.enqueued - ue.ewma;
+	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+		return;
+
+	/*
+	 * Update Task's estimated utilization
+	 *
+	 * When *p completes an activation we can consolidate another sample
+	 * of the task size. This is done by storing the current PELT value
+	 * as ue.enqueued and by using this value to update the Exponential
+	 * Weighted Moving Average (EWMA):
+	 *
+	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
+	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *
+	 * Where 'w' is the weight of new samples, which is configured to be
+	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+	 */
+	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ue.ewma  += last_ewma_diff;
+	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
 #else /* CONFIG_SMP */
 
 static inline int
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
 	return 0;
 }
 
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+		 bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
+	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		sub_nr_running(rq, 1);
 
+	util_est_dequeue(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
 }
 
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-	return p->se.avg.util_avg;
-}
-
 /*
  * cpu_util_wake: Compute CPU utilization with any contributions from
  * the waking task p removed.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..c459a4b61544 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
-- 
cgit v1.2.3-71-gd317


From f9be3e5961c5554879a491961187472e923f5ee0 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 9 Mar 2018 09:52:43 +0000
Subject: sched/fair: Use util_est in LB and WU paths

When the scheduler looks at the CPU utilization, the current PELT value
for a CPU is returned straight away. In certain scenarios this can have
undesired side effects on task placement.

For example, since the task utilization is decayed at wakeup time, when
a long sleeping big task is enqueued it does not add immediately a
significant contribution to the target CPU.
As a result we generate a race condition where other tasks can be placed
on the same CPU while it is still considered relatively empty.

In order to reduce this kind of race conditions, this patch introduces the
required support to integrate the usage of the CPU's estimated utilization
in the wakeup path, via cpu_util_wake(), as well as in the load-balance
path, via cpu_util() which is used by update_sg_lb_stats().

The estimated utilization of a CPU is defined to be the maximum between
its PELT's utilization and the sum of the estimated utilization (at
previous dequeue time) of all the tasks currently RUNNABLE on that CPU.
This allows to properly represent the spare capacity of a CPU which, for
example, has just got a big task running since a long sleep period.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-3-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22b59a7facd2..570b8d056282 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6432,11 +6432,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	return target;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS task
+ * (ie cpu_capacity).
  *
  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
  * recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6447,6 +6449,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
  * the running time on this CPU scaled by capacity_curr.
  *
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows to properly represent the expected utilization of a CPU which
+ * has just got a big task running since a long sleep period. At the same time
+ * however it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
  * higher than capacity_orig because of unfortunate rounding in
  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6457,13 +6467,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  * available capacity. We allow utilization to overshoot capacity_curr (but not
  * capacity_orig) as it useful for predicting the capacity required after task
  * migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
  */
-static unsigned long cpu_util(int cpu)
+static inline unsigned long cpu_util(int cpu)
 {
-	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-	unsigned long capacity = capacity_orig_of(cpu);
+	struct cfs_rq *cfs_rq;
+	unsigned int util;
+
+	cfs_rq = &cpu_rq(cpu)->cfs;
+	util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	if (sched_feat(UTIL_EST))
+		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
 
-	return (util >= capacity) ? capacity : util;
+	return min_t(unsigned long, util, capacity_orig_of(cpu));
 }
 
 /*
@@ -6472,16 +6490,54 @@ static unsigned long cpu_util(int cpu)
  */
 static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 {
-	unsigned long util, capacity;
+	struct cfs_rq *cfs_rq;
+	unsigned int util;
 
 	/* Task has no contribution or is new */
-	if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
 		return cpu_util(cpu);
 
-	capacity = capacity_orig_of(cpu);
-	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+	cfs_rq = &cpu_rq(cpu)->cfs;
+	util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	/* Discount task's blocked util from CPU's util */
+	util -= min_t(unsigned int, util, task_util(p));
 
-	return (util >= capacity) ? capacity : util;
+	/*
+	 * Covered cases:
+	 *
+	 * a) if *p is the only task sleeping on this CPU, then:
+	 *      cpu_util (== task_util) > util_est (== 0)
+	 *    and thus we return:
+	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *
+	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
+	 *    IDLE, then:
+	 *      cpu_util >= task_util
+	 *      cpu_util > util_est (== 0)
+	 *    and thus we discount *p's blocked utilization to return:
+	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *
+	 * c) if other tasks are RUNNABLE on that CPU and
+	 *      util_est > cpu_util
+	 *    then we use util_est since it returns a more restrictive
+	 *    estimation of the spare capacity on that CPU, by just
+	 *    considering the expected utilization of tasks already
+	 *    runnable on that CPU.
+	 *
+	 * Cases a) and b) are covered by the above code, while case c) is
+	 * covered by the following code when estimated utilization is
+	 * enabled.
+	 */
+	if (sched_feat(UTIL_EST))
+		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+	/*
+	 * Utilization (estimated) can exceed the CPU capacity, thus let's
+	 * clamp to the maximum CPU capacity to ensure consistency with
+	 * the cpu_util call.
+	 */
+	return min_t(unsigned long, util, capacity_orig_of(cpu));
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From a07630b8b2c16f82fd5b71d890079f4dd7599c1d Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 9 Mar 2018 09:52:44 +0000
Subject: sched/cpufreq/schedutil: Use util_est for OPP selection

When schedutil looks at the CPU utilization, the current PELT value for
that CPU is returned straight away. In certain scenarios this can have
undesired side effects and delays on frequency selection.

For example, since the task utilization is decayed at wakeup time, a
long sleeping big task newly enqueued does not add immediately a
significant contribution to the target CPU. This introduces some latency
before schedutil will be able to detect the best frequency required by
that task.

Moreover, the PELT signal build-up time is a function of the current
frequency, because of the scale invariant load tracking support. Thus,
starting from a lower frequency, the utilization build-up time will
increase even more and further delays the selection of the actual
frequency which better serves the task requirements.

In order to reduce these kind of latencies, we integrate the usage
of the CPU's estimated utilization in the sugov_get_util function.

This allows to properly consider the expected utilization of a CPU which,
for example, has just got a big task running after a long sleep period.
Ultimately this allows to select the best frequency to run a task
right after its wake-up.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-4-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 22909ffc04fb..c3deaee7a7a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2163,6 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
 
 static inline unsigned long cpu_util_cfs(struct rq *rq)
 {
-	return rq->cfs.avg.util_avg;
+	unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+
+	if (sched_feat(UTIL_EST)) {
+		util = max_t(unsigned long, util,
+			     READ_ONCE(rq->cfs.avg.util_est.enqueued));
+	}
+
+	return util;
 }
 #endif
-- 
cgit v1.2.3-71-gd317


From d519329f72a6f36bc4f2b85452640cfe583b4f81 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 9 Mar 2018 09:52:45 +0000
Subject: sched/fair: Update util_est only on util_avg updates

The estimated utilization of a task is currently updated every time the
task is dequeued. However, to keep overheads under control, PELT signals
are effectively updated at maximum once every 1ms.

Thus, for really short running tasks, it can happen that their util_avg
value has not been updates since their last enqueue.  If such tasks are
also frequently running tasks (e.g. the kind of workload generated by
hackbench) it can also happen that their util_avg is updated only every
few activations.

This means that updating util_est at every dequeue potentially introduces
not necessary overheads and it's also conceptually wrong if the util_avg
signal has never been updated during a task activation.

Let's introduce a throttling mechanism on task's util_est updates
to sync them with util_avg updates. To make the solution memory
efficient, both in terms of space and load/store operations, we encode a
synchronization flag into the LSB of util_est.enqueued.
This makes util_est an even values only metric, which is still
considered good enough for its purpose.
The synchronization bit is (re)set by __update_load_avg_se() once the
PELT signal of a task has been updated during its last activation.

Such a throttling mechanism allows to keep under control util_est
overheads in the wakeup hot path, thus making it a suitable mechanism
which can be enabled also on high-intensity workload systems.
Thus, this now switches on by default the estimation utilization
scheduler feature.

Suggested-by: Chris Redpath <chris.redpath@arm.com>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20180309095245.11071-5-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 42 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/features.h |  2 +-
 2 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 570b8d056282..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3242,6 +3242,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
 	sa->util_avg = sa->util_sum / divider;
 }
 
+/*
+ * When a task is dequeued, its estimated utilization should not be update if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Avoid store if the flag has been already set */
+	enqueued = avg->util_est.enqueued;
+	if (!(enqueued & UTIL_AVG_UNCHANGED))
+		return;
+
+	/* Reset flag to report util_avg has been updated */
+	enqueued &= ~UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
 /*
  * sched_entity:
  *
@@ -3293,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
 				cfs_rq->curr == se)) {
 
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+		cfs_se_util_change(&se->avg);
 		return 1;
 	}
 
@@ -3900,7 +3927,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 
 	/* Update root cfs_rq's estimated utilization */
 	enqueued  = cfs_rq->avg.util_est.enqueued;
-	enqueued += _task_util_est(p);
+	enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
 }
 
@@ -3936,7 +3963,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	if (cfs_rq->nr_running) {
 		ue.enqueued  = cfs_rq->avg.util_est.enqueued;
 		ue.enqueued -= min_t(unsigned int, ue.enqueued,
-				     _task_util_est(p));
+				     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 	}
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
@@ -3947,12 +3974,19 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	if (!task_sleep)
 		return;
 
+	/*
+	 * If the PELT values haven't changed since enqueue time,
+	 * skip the util_est update.
+	 */
+	ue = p->se.avg.util_est;
+	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+		return;
+
 	/*
 	 * Skip update of task's estimated utilization when its EWMA is
 	 * already ~1% close to its last activation value.
 	 */
-	ue = p->se.avg.util_est;
-	ue.enqueued = task_util(p);
+	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
 	last_ewma_diff = ue.enqueued - ue.ewma;
 	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
 		return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c459a4b61544..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,4 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
 /*
  * UtilEstimation. Use estimated CPU utilization.
  */
-SCHED_FEAT(UTIL_EST, false)
+SCHED_FEAT(UTIL_EST, true)
-- 
cgit v1.2.3-71-gd317


From 6b2bb7265f0b62605e8caee3613449ed0db270b9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Mar 2018 11:40:33 +0100
Subject: sched/wait: Introduce wait_var_event()

As a replacement for the wait_on_atomic_t() API provide the
wait_var_event() API.

The wait_var_event() API is based on the very same hashed-waitqueue
idea, but doesn't care about the type (atomic_t) or the specific
condition (atomic_read() == 0). IOW. it's much more widely
applicable/flexible.

It shares all the benefits/disadvantages of a hashed-waitqueue
approach with the existing wait_on_atomic_t/wait_on_bit() APIs.

The API is modeled after the existing wait_event() API, but instead of
taking a wait_queue_head, it takes an address. This addresses is
hashed to obtain a wait_queue_head from the bit_wait_table.

Similar to the wait_event() API, it takes a condition expression as
second argument and will wait until this expression becomes true.

The following are (mostly) identical replacements:

	wait_on_atomic_t(&my_atomic, atomic_t_wait, TASK_UNINTERRUPTIBLE);
	wake_up_atomic_t(&my_atomic);

	wait_var_event(&my_atomic, !atomic_read(&my_atomic));
	wake_up_var(&my_atomic);

The only difference is that wake_up_var() is an unconditional wakeup
and doesn't check the previously hard-coded (atomic_read() == 0)
condition here. This is of little concequence, since most callers are
already conditional on atomic_dec_and_test() and the ones that are
not, are trivial to make so.

Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait_bit.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/wait_bit.c  | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 61b39eaf7cad..3fcdb75d69cf 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode
 	return out_of_line_wait_on_atomic_t(val, action, mode);
 }
 
+extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
+extern void wake_up_var(void *var);
+extern wait_queue_head_t *__var_waitqueue(void *p);
+
+#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)	\
+({									\
+	__label__ __out;						\
+	struct wait_queue_head *__wq_head = __var_waitqueue(var);	\
+	struct wait_bit_queue_entry __wbq_entry;			\
+	long __ret = ret; /* explicit shadow */				\
+									\
+	init_wait_var_entry(&__wbq_entry, var,				\
+			    exclusive ? WQ_FLAG_EXCLUSIVE : 0);		\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(__wq_head,		\
+						   &__wbq_entry.wq_entry, \
+						   state);		\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_wait(__wq_head, &__wbq_entry.wq_entry);			\
+__out:	__ret;								\
+})
+
+#define __wait_var_event(var, condition)				\
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			  schedule())
+
+#define wait_var_event(var, condition)					\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__wait_var_event(var, condition);				\
+} while (0)
+
+#define __wait_var_event_killable(var, condition)			\
+	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,		\
+			  schedule())
+
+#define wait_var_event_killable(var, condition)				\
+({									\
+	int __ret = 0;							\
+	might_sleep();							\
+	if (!(condition))						\
+		__ret = __wait_var_event_killable(var, condition);	\
+	__ret;								\
+})
+
+#define __wait_var_event_timeout(var, condition, timeout)		\
+	___wait_var_event(var, ___wait_cond_timeout(condition),		\
+			  TASK_UNINTERRUPTIBLE, 0, timeout,		\
+			  __ret = schedule_timeout(__ret))
+
+#define wait_var_event_timeout(var, condition, timeout)			\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_var_event_timeout(var, condition, timeout); \
+	__ret;								\
+})
+
 #endif /* _LINUX_WAIT_BIT_H */
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 4239c78f5cd3..ed84ab245a05 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -149,6 +149,54 @@ void wake_up_bit(void *word, int bit)
 }
 EXPORT_SYMBOL(wake_up_bit);
 
+wait_queue_head_t *__var_waitqueue(void *p)
+{
+	if (BITS_PER_LONG == 64) {
+		unsigned long q = (unsigned long)p;
+
+		return bit_waitqueue((void *)(q & ~1), q & 1);
+	}
+	return bit_waitqueue(p, 0);
+}
+EXPORT_SYMBOL(__var_waitqueue);
+
+static int
+var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+		  int sync, void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue_entry *wbq_entry =
+		container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+	if (wbq_entry->key.flags != key->flags ||
+	    wbq_entry->key.bit_nr != key->bit_nr)
+		return 0;
+
+	return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
+{
+	*wbq_entry = (struct wait_bit_queue_entry){
+		.key = {
+			.flags	= (var),
+			.bit_nr = -1,
+		},
+		.wq_entry = {
+			.private = current,
+			.func	 = var_wake_function,
+			.entry	 = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
+		},
+	};
+}
+EXPORT_SYMBOL(init_wait_var_entry);
+
+void wake_up_var(void *var)
+{
+	__wake_up_bit(__var_waitqueue(var), var, -1);
+}
+EXPORT_SYMBOL(wake_up_var);
+
 /*
  * Manipulate the atomic_t address to produce a better bit waitqueue table hash
  * index (we're keying off bit -1, but that would produce a horrible hash
-- 
cgit v1.2.3-71-gd317


From 9b8cce52c4b5c08297900bfdcafc6b08d9bc4a27 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Mar 2018 11:46:30 +0100
Subject: sched/wait: Remove the wait_on_atomic_t() API

There are no users left (everyone got converted to wait_var_event()), remove it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait_bit.h |  27 -------------
 kernel/sched/wait_bit.c  | 101 -----------------------------------------------
 2 files changed, 128 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 3fcdb75d69cf..9318b2166439 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -10,7 +10,6 @@
 struct wait_bit_key {
 	void			*flags;
 	int			bit_nr;
-#define WAIT_ATOMIC_T_BIT_NR	-1
 	unsigned long		timeout;
 };
 
@@ -22,21 +21,15 @@ struct wait_bit_queue_entry {
 #define __WAIT_BIT_KEY_INITIALIZER(word, bit)					\
 	{ .flags = word, .bit_nr = bit, }
 
-#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p)					\
-	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
-
 typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
-typedef int wait_atomic_t_action_f(atomic_t *counter, unsigned int mode);
 
 void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
 int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 void wake_up_bit(void *word, int bit);
-void wake_up_atomic_t(atomic_t *p);
 int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
 int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
 int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_atomic_t(atomic_t *p, wait_atomic_t_action_f action, unsigned int mode);
 struct wait_queue_head *bit_waitqueue(void *word, int bit);
 extern void __init wait_bit_init(void);
 
@@ -57,7 +50,6 @@ extern int bit_wait(struct wait_bit_key *key, int mode);
 extern int bit_wait_io(struct wait_bit_key *key, int mode);
 extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
 extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
-extern int atomic_t_wait(atomic_t *counter, unsigned int mode);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
@@ -243,25 +235,6 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
 	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
 }
 
-/**
- * wait_on_atomic_t - Wait for an atomic_t to become 0
- * @val: The atomic value being waited on, a kernel virtual address
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Wait for an atomic_t to become 0.  We abuse the bit-wait waitqueue table for
- * the purpose of getting a waitqueue, but we set the key to a bit number
- * outside of the target 'word'.
- */
-static inline
-int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode)
-{
-	might_sleep();
-	if (atomic_read(val) == 0)
-		return 0;
-	return out_of_line_wait_on_atomic_t(val, action, mode);
-}
-
 extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
 extern void wake_up_var(void *var);
 extern wait_queue_head_t *__var_waitqueue(void *p);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index ed84ab245a05..60a84f5a6cb4 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -197,107 +197,6 @@ void wake_up_var(void *var)
 }
 EXPORT_SYMBOL(wake_up_var);
 
-/*
- * Manipulate the atomic_t address to produce a better bit waitqueue table hash
- * index (we're keying off bit -1, but that would produce a horrible hash
- * value).
- */
-static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
-{
-	if (BITS_PER_LONG == 64) {
-		unsigned long q = (unsigned long)p;
-
-		return bit_waitqueue((void *)(q & ~1), q & 1);
-	}
-	return bit_waitqueue(p, 0);
-}
-
-static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
-				  void *arg)
-{
-	struct wait_bit_key *key = arg;
-	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
-	atomic_t *val = key->flags;
-
-	if (wait_bit->key.flags != key->flags ||
-	    wait_bit->key.bit_nr != key->bit_nr ||
-	    atomic_read(val) != 0)
-		return 0;
-
-	return autoremove_wake_function(wq_entry, mode, sync, key);
-}
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
- * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero
- * return codes halt waiting and return.
- */
-static __sched
-int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-		       wait_atomic_t_action_f action, unsigned int mode)
-{
-	atomic_t *val;
-	int ret = 0;
-
-	do {
-		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
-		val = wbq_entry->key.flags;
-		if (atomic_read(val) == 0)
-			break;
-		ret = (*action)(val, mode);
-	} while (!ret && atomic_read(val) != 0);
-	finish_wait(wq_head, &wbq_entry->wq_entry);
-
-	return ret;
-}
-
-#define DEFINE_WAIT_ATOMIC_T(name, p)					\
-	struct wait_bit_queue_entry name = {				\
-		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
-		.wq_entry = {						\
-			.private	= current,			\
-			.func		= wake_atomic_t_function,	\
-			.entry		=				\
-				LIST_HEAD_INIT((name).wq_entry.entry),	\
-		},							\
-	}
-
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
-					 wait_atomic_t_action_f action,
-					 unsigned int mode)
-{
-	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
-	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
-
-	return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
-
-__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
-{
-	schedule();
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-
-	return 0;
-}
-EXPORT_SYMBOL(atomic_t_wait);
-
-/**
- * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @p: The atomic_t being waited on, a kernel virtual address
- *
- * Wake up anyone waiting for the atomic_t to go to zero.
- *
- * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
- * check is done by the waiter's wake function, not the by the waker itself).
- */
-void wake_up_atomic_t(atomic_t *p)
-{
-	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
-}
-EXPORT_SYMBOL(wake_up_atomic_t);
-
 __sched int bit_wait(struct wait_bit_key *word, int mode)
 {
 	schedule();
-- 
cgit v1.2.3-71-gd317


From b3fc5c9bb373661224906bf434c09ca0de032e82 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 15 Mar 2018 11:49:36 +0100
Subject: sched/wait: Improve __var_waitqueue() code generation

Since we fixed hash_64() to not suck there is no need to play games to
attempt to improve the hash value on 64-bit.

Also, since we don't use the bit value for the variables, use hash_ptr()
directly.

No change in functionality.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/wait_bit.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 60a84f5a6cb4..c67c6d24adc2 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -151,12 +151,7 @@ EXPORT_SYMBOL(wake_up_bit);
 
 wait_queue_head_t *__var_waitqueue(void *p)
 {
-	if (BITS_PER_LONG == 64) {
-		unsigned long q = (unsigned long)p;
-
-		return bit_waitqueue((void *)(q & ~1), q & 1);
-	}
-	return bit_waitqueue(p, 0);
+	return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
 }
 EXPORT_SYMBOL(__var_waitqueue);
 
-- 
cgit v1.2.3-71-gd317


From 578ae447e7e5d78c90ac40a06406c1741f79ba96 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 19 Mar 2018 13:18:57 -0500
Subject: jump_label: Disable jump labels in __exit code

With the following commit:

  333522447063 ("jump_label: Explicitly disable jump labels in __init code")

... we explicitly disabled jump labels in __init code, so they could be
detected and not warned about in the following commit:

  dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")

In-kernel __exit code has the same issue.  It's never used, so it's
freed along with the rest of initmem.  But jump label entries in __exit
code aren't explicitly disabled, so we get the following warning when
enabling pr_debug() in __exit code:

  can't patch jump_label at dmi_sysfs_exit+0x0/0x2d
  WARNING: CPU: 0 PID: 22572 at kernel/jump_label.c:376 __jump_label_update+0x9d/0xb0

Fix the warning by disabling all jump labels in initmem (which includes
both __init and __exit code).

Reported-and-tested-by: Li Wang <liwang@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: dc1dd184c2f0 ("jump_label: Warn on failed jump_label patching attempt")
Link: http://lkml.kernel.org/r/7121e6e595374f06616c505b6e690e275c0054d1.1521483452.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 4 ++--
 init/main.c                | 2 +-
 kernel/jump_label.c        | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2168cc6b8b30..b46b541c67c4 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -151,7 +151,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 extern void jump_label_init(void);
-extern void jump_label_invalidate_init(void);
+extern void jump_label_invalidate_initmem(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -199,7 +199,7 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 }
 
-static inline void jump_label_invalidate_init(void) {}
+static inline void jump_label_invalidate_initmem(void) {}
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
diff --git a/init/main.c b/init/main.c
index 969eaf140ef0..21efbf6ace93 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1001,7 +1001,7 @@ static int __ref kernel_init(void *unused)
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
 	ftrace_free_init_mem();
-	jump_label_invalidate_init();
+	jump_label_invalidate_initmem();
 	free_initmem();
 	mark_readonly();
 	system_state = SYSTEM_RUNNING;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e7214093dcd1..01ebdf1f9f40 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
+#include <asm/sections.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -421,15 +422,15 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
-/* Disable any jump label entries in __init code */
-void __init jump_label_invalidate_init(void)
+/* Disable any jump label entries in __init/__exit code */
+void __init jump_label_invalidate_initmem(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
 	struct jump_entry *iter_stop = __stop___jump_table;
 	struct jump_entry *iter;
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		if (init_kernel_text(iter->code))
+		if (init_section_contains((void *)(unsigned long)iter->code, 1))
 			iter->code = 0;
 	}
 }
-- 
cgit v1.2.3-71-gd317


From c917e0f259908e75bd2a65877e25f9d90c22c848 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 12 Mar 2018 09:59:43 -0700
Subject: perf/cgroup: Fix child event counting bug

When a perf_event is attached to parent cgroup, it should count events
for all children cgroups:

   parent_group   <---- perf_event
     \
      - child_group  <---- process(es)

However, in our tests, we found this perf_event cannot report reliable
results. Here is an example case:

  # create cgroups
  mkdir -p /sys/fs/cgroup/p/c
  # start perf for parent group
  perf stat -e instructions -G "p"

  # on another console, run test process in child cgroup:
  stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs

  # after the test process is done, stop perf in the first console shows

       <not counted>      instructions              p

The instruction should not be "not counted" as the process runs in the
child cgroup.

We found this is because perf_event->cgrp and cpuctx->cgrp are not
identical, thus perf_event->cgrp are not updated properly.

This patch fixes this by updating perf_cgroup properly for ancestor
cgroup(s).

Reported-by: Ephraim Park <ephiepark@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <jolsa@redhat.com>
Cc: <kernel-team@fb.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20180312165943.1057894-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b838470fac4..709a55b9ad97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-	if (cgrp_out)
-		__update_cgrp_time(cgrp_out);
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
+	struct cgroup_subsys_state *css;
+
+	if (cgrp) {
+		for (css = &cgrp->css; css; css = css->parent) {
+			cgrp = container_of(css, struct perf_cgroup, css);
+			__update_cgrp_time(cgrp);
+		}
+	}
 }
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 {
 	struct perf_cgroup *cgrp;
 	struct perf_cgroup_info *info;
+	struct cgroup_subsys_state *css;
 
 	/*
 	 * ctx->lock held by caller
@@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 		return;
 
 	cgrp = perf_cgroup_from_task(task, ctx);
-	info = this_cpu_ptr(cgrp->info);
-	info->timestamp = ctx->timestamp;
+
+	for (css = &cgrp->css; css; css = css->parent) {
+		cgrp = container_of(css, struct perf_cgroup, css);
+		info = this_cpu_ptr(cgrp->info);
+		info->timestamp = ctx->timestamp;
+	}
 }
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-- 
cgit v1.2.3-71-gd317


From a8c024cd9b9683d25ae1f459525dd2c6bec75e79 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Mon, 19 Mar 2018 14:35:54 -0400
Subject: sched/debug: Fix per-task line continuation for console output

When the SEQ_printf() macro prints to the console, it runs a simple
printk() without KERN_CONT "continued" line printing.  The result of
this is oddly wrapped task info, for example:

  % echo t > /proc/sysrq-trigger
  % dmesg
  ...
  runnable tasks:
  ...
  [   29.608611]  I
  [   29.608613]       rcu_sched     8      3252.013846      4087   120
  [   29.608614]         0.000000        29.090111         0.000000
  [   29.608615]  0 0
  [   29.608616]  /

Modify SEQ_printf to use pr_cont() for expected one-line results:

  % echo t > /proc/sysrq-trigger
  % dmesg
  ...
  runnable tasks:
  ...
  [  106.716329]  S        cpuhp/5    37      2006.315026        14   120         0.000000         0.496893         0.000000 0 0 /

Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1521484555-8620-2-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ca0130ed4f9..50026aa2d81e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);
 	if (m)					\
 		seq_printf(m, x);		\
 	else					\
-		printk(x);			\
+		pr_cont(x);			\
  } while (0)
 
 /*
-- 
cgit v1.2.3-71-gd317


From e9ca267096674eadd1fd479279bcb58df1486049 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Mon, 19 Mar 2018 14:35:55 -0400
Subject: sched/debug: Adjust newlines for better alignment

Scheduler debug stats include newlines that display out of alignment
when prefixed by timestamps.  For example, the dmesg utility:

  % echo t > /proc/sysrq-trigger
  % dmesg
  ...
  [   83.124251]
  runnable tasks:
   S           task   PID         tree-key  switches  prio     wait-time
  sum-exec        sum-sleep
  -----------------------------------------------------------------------------------------------------------

At the same time, some syslog utilities (like rsyslog by default) don't
like the additional newlines control characters, saving lines like this
to /var/log/messages:

  Mar 16 16:02:29 localhost kernel: #012runnable tasks:#012 S           task   PID         tree-key ...
                                    ^^^^               ^^^^
Clean these up by moving newline characters to their own SEQ_printf
invocation.  This leaves the /proc/sched_debug unchanged, but brings the
entire output into alignment when prefixed:

  % echo t > /proc/sysrq-trigger
  % dmesg
  ...
  [   62.410368] runnable tasks:
  [   62.410368]  S           task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep
  [   62.410369] -----------------------------------------------------------------------------------------------------------
  [   62.410369]  I  kworker/u12:0     5      1932.215593       332   120         0.000000         3.621252         0.000000 0 0 /

and no escaped control characters from rsyslog in /var/log/messages:

  Mar 16 16:15:06 localhost kernel: runnable tasks:
  Mar 16 16:15:06 localhost kernel: S           task   PID         tree-key  ...

Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1521484555-8620-3-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 50026aa2d81e..72c401b3b15c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -501,12 +501,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 {
 	struct task_struct *g, *p;
 
-	SEQ_printf(m,
-	"\nrunnable tasks:\n"
-	" S           task   PID         tree-key  switches  prio"
-	"     wait-time             sum-exec        sum-sleep\n"
-	"-------------------------------------------------------"
-	"----------------------------------------------------\n");
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "runnable tasks:\n");
+	SEQ_printf(m, " S           task   PID         tree-key  switches  prio"
+		   "     wait-time             sum-exec        sum-sleep\n");
+	SEQ_printf(m, "-------------------------------------------------------"
+		   "----------------------------------------------------\n");
 
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
@@ -527,9 +527,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	unsigned long flags;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
 #else
-	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
 #endif
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
@@ -595,9 +597,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
 #else
-	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "rt_rq[%d]:\n", cpu);
 #endif
 
 #define P(x) \
@@ -624,7 +628,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 {
 	struct dl_bw *dl_bw;
 
-	SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+	SEQ_printf(m, "\n");
+	SEQ_printf(m, "dl_rq[%d]:\n", cpu);
 
 #define PU(x) \
 	SEQ_printf(m, "  .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
-- 
cgit v1.2.3-71-gd317


From f005afede992e265bb98534b86912bb669ccd0d2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 20 Mar 2018 11:19:17 -0700
Subject: trace/bpf: remove helper bpf_perf_prog_read_value from tracepoint
 type programs

Commit 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value")
added helper bpf_perf_prog_read_value so that perf_event type program
can read event counter and enabled/running time.
This commit, however, introduced a bug which allows this helper
for tracepoint type programs. This is incorrect as bpf_perf_prog_read_value
needs to access perf_event through its bpf_perf_event_data_kern type context,
which is not available for tracepoint type program.

This patch fixed the issue by separating bpf_func_proto between tracepoint
and perf_event type programs and removed bpf_perf_prog_read_value
from tracepoint func prototype.

Fixes: 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 68 ++++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c0a9e310d715..01e6b3a38871 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto_tp;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+				    struct bpf_insn_access_aux *info)
+{
+	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+
+	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
+	return true;
+}
+
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
+	.get_func_proto  = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
+BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
 	   struct bpf_perf_event_value *, buf, u32, size)
 {
 	int err = -EINVAL;
@@ -678,8 +712,8 @@ clear:
 	return err;
 }
 
-static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
-         .func           = bpf_perf_prog_read_value_tp,
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
+         .func           = bpf_perf_prog_read_value,
          .gpl_only       = true,
          .ret_type       = RET_INTEGER,
          .arg1_type      = ARG_PTR_TO_CTX,
@@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
          .arg3_type      = ARG_CONST_SIZE,
 };
 
-static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
@@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
-		return &bpf_perf_prog_read_value_proto_tp;
+		return &bpf_perf_prog_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
 }
 
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-				    struct bpf_insn_access_aux *info)
-{
-	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
-		return false;
-	if (type != BPF_READ)
-		return false;
-	if (off % size != 0)
-		return false;
-
-	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
-	return true;
-}
-
-const struct bpf_verifier_ops tracepoint_verifier_ops = {
-	.get_func_proto  = tp_prog_func_proto,
-	.is_valid_access = tp_prog_is_valid_access,
-};
-
-const struct bpf_prog_ops tracepoint_prog_ops = {
-};
-
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
@@ -779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
 }
 
 const struct bpf_verifier_ops perf_event_verifier_ops = {
-	.get_func_proto		= tp_prog_func_proto,
+	.get_func_proto		= pe_prog_func_proto,
 	.is_valid_access	= pe_prog_is_valid_access,
 	.convert_ctx_access	= pe_prog_convert_ctx_access,
 };
-- 
cgit v1.2.3-71-gd317


From 0fa4fe85f4724fff89b09741c437cbee9cf8b008 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Mon, 19 Mar 2018 17:57:27 -0700
Subject: bpf: skip unnecessary capability check

The current check statement in BPF syscall will do a capability check
for CAP_SYS_ADMIN before checking sysctl_unprivileged_bpf_disabled. This
code path will trigger unnecessary security hooks on capability checking
and cause false alarms on unprivileged process trying to get CAP_SYS_ADMIN
access. This can be resolved by simply switch the order of the statement
and CAP_SYS_ADMIN is not required anyway if unprivileged bpf syscall is
allowed.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..43f95d190eea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
-- 
cgit v1.2.3-71-gd317


From 19b558db12f9f4e45a22012bae7b4783e62224da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Feb 2018 17:21:55 +0100
Subject: posix-timers: Protect posix clock array access against speculation

The clockid argument of clockid_to_kclock() comes straight from user space
via various syscalls and is used as index into the posix_clocks array.

Protect it against spectre v1 array out of bounds speculation. Remove the
redundant check for !posix_clock[id] as this is another source for
speculation and does not provide any advantage over the return
posix_clock[id] path which returns NULL in that case anyway.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: stable@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1802151718320.1296@nanos.tec.linutronix.de
---
 kernel/time/posix-timers.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 75043046914e..10b7186d0638 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -50,6 +50,7 @@
 #include <linux/export.h>
 #include <linux/hashtable.h>
 #include <linux/compat.h>
+#include <linux/nospec.h>
 
 #include "timekeeping.h"
 #include "posix-timers.h"
@@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = {
 
 static const struct k_clock *clockid_to_kclock(const clockid_t id)
 {
-	if (id < 0)
+	clockid_t idx = id;
+
+	if (id < 0) {
 		return (id & CLOCKFD_MASK) == CLOCKFD ?
 			&clock_posix_dynamic : &clock_posix_cpu;
+	}
 
-	if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
+	if (id >= ARRAY_SIZE(posix_clocks))
 		return NULL;
-	return posix_clocks[id];
+
+	return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
 }
-- 
cgit v1.2.3-71-gd317


From c5d343b6b7badd1f5fe0873eff2e8d63a193e732 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sat, 17 Mar 2018 21:38:10 +0900
Subject: tracing: probeevent: Fix to support minus offset from symbol

In Documentation/trace/kprobetrace.txt, it says

 @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)

However, the parser doesn't parse minus offset correctly, since
commit 2fba0c8867af ("tracing/kprobes: Fix probe offset to be
unsigned") drops minus ("-") offset support for kprobe probe
address usage.

This fixes the traceprobe_split_symbol_offset() to parse minus
offset again with checking the offset range, and add a minus
offset check in kprobe probe address usage.

Link: http://lkml.kernel.org/r/152129028983.31874.13419301530285775521.stgit@devbox

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org
Fixes: 2fba0c8867af ("tracing/kprobes: Fix probe offset to be unsigned")
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 4 ++--
 kernel/trace/trace_probe.c  | 8 +++-----
 kernel/trace/trace_probe.h  | 2 +-
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1fad24acd444..ae4147eaebd4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -659,7 +659,7 @@ static int create_trace_kprobe(int argc, char **argv)
 	char *symbol = NULL, *event = NULL, *group = NULL;
 	int maxactive = 0;
 	char *arg;
-	unsigned long offset = 0;
+	long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
 
@@ -747,7 +747,7 @@ static int create_trace_kprobe(int argc, char **argv)
 		symbol = argv[1];
 		/* TODO: support .init module functions */
 		ret = traceprobe_split_symbol_offset(symbol, &offset);
-		if (ret) {
+		if (ret || offset < 0 || offset > UINT_MAX) {
 			pr_info("Failed to parse either an address or a symbol.\n");
 			return ret;
 		}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d59357308677..daf54bda4dc8 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
 }
 
 /* Split symbol and offset. */
-int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+int traceprobe_split_symbol_offset(char *symbol, long *offset)
 {
 	char *tmp;
 	int ret;
@@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
 	if (!offset)
 		return -EINVAL;
 
-	tmp = strchr(symbol, '+');
+	tmp = strpbrk(symbol, "+-");
 	if (tmp) {
-		/* skip sign because kstrtoul doesn't accept '+' */
-		ret = kstrtoul(tmp + 1, 0, offset);
+		ret = kstrtol(tmp, 0, offset);
 		if (ret)
 			return ret;
-
 		*tmp = '\0';
 	} else
 		*offset = 0;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e101c5bb9eda..6a4d3fa94042 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name,
 extern void traceprobe_update_arg(struct probe_arg *arg);
 extern void traceprobe_free_probe_arg(struct probe_arg *arg);
 
-extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
 
 /* Sum up total data length for dynamic arraies (strings) */
 static nokprobe_inline int
-- 
cgit v1.2.3-71-gd317


From e97a90f7069b740575bcb1dae86596e0484b8957 Mon Sep 17 00:00:00 2001
From: Claudio Scordino <claudio@evidence.eu.com>
Date: Tue, 13 Mar 2018 11:35:40 +0100
Subject: sched/cpufreq: Rate limits for SCHED_DEADLINE

When the SCHED_DEADLINE scheduling class increases the CPU utilization, it
should not wait for the rate limit, otherwise it may miss some deadline.

Tests using rt-app on Exynos5422 with up to 10 SCHED_DEADLINE tasks have
shown reductions of even 10% of deadline misses with a negligible
increase of energy consumption (measured through Baylibre Cape).

Signed-off-by: Claudio Scordino <claudio@evidence.eu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: linux-pm@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Todd Kjos <tkjos@android.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/r/1520937340-2755-1-git-send-email-claudio@evidence.eu.com
---
 kernel/sched/cpufreq_schedutil.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 89fe78ecb88c..2b124811947d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -267,6 +267,16 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
 #endif /* CONFIG_NO_HZ_COMMON */
 
+/*
+ * Make sugov_should_update_freq() ignore the rate limit when DL
+ * has increased the utilization.
+ */
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+{
+	if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
+		sg_policy->need_freq_update = true;
+}
+
 static void sugov_update_single(struct update_util_data *hook, u64 time,
 				unsigned int flags)
 {
@@ -279,6 +289,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
+	ignore_dl_rate_limit(sg_cpu, sg_policy);
+
 	if (!sugov_should_update_freq(sg_policy, time))
 		return;
 
@@ -356,6 +368,8 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
+	ignore_dl_rate_limit(sg_cpu, sg_policy);
+
 	if (sugov_should_update_freq(sg_policy, time)) {
 		next_f = sugov_next_freq_shared(sg_cpu, time);
 		sugov_update_commit(sg_policy, time, next_f);
-- 
cgit v1.2.3-71-gd317


From b720342849fe685310fca01748a32730a6eca5aa Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 26 Mar 2018 14:09:26 -0700
Subject: sched/core: Update preempt_notifier_key to modern API

No changes in refcount semantics, use DEFINE_STATIC_KEY_FALSE()
for initialization and replace:

  static_key_slow_inc|dec()   =>   static_branch_inc|dec()
  static_key_false()          =>   static_branch_unlikely()

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akpm@linux-foundation.org
Link: http://lkml.kernel.org/r/20180326210929.5244-4-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b249adbf2a48..de440456f15c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2462,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
-static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
 
 void preempt_notifier_inc(void)
 {
-	static_key_slow_inc(&preempt_notifier_key);
+	static_branch_inc(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
 
 void preempt_notifier_dec(void)
 {
-	static_key_slow_dec(&preempt_notifier_key);
+	static_branch_dec(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
 
@@ -2482,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
-	if (!static_key_false(&preempt_notifier_key))
+	if (!static_branch_unlikely(&preempt_notifier_key))
 		WARN(1, "registering preempt_notifier while notifiers disabled\n");
 
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
@@ -2511,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
 
 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
-	if (static_key_false(&preempt_notifier_key))
+	if (static_branch_unlikely(&preempt_notifier_key))
 		__fire_sched_in_preempt_notifiers(curr);
 }
 
@@ -2529,7 +2529,7 @@ static __always_inline void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
-	if (static_key_false(&preempt_notifier_key))
+	if (static_branch_unlikely(&preempt_notifier_key))
 		__fire_sched_out_preempt_notifiers(curr, next);
 }
 
-- 
cgit v1.2.3-71-gd317


From f67b15037a7a50c57f72e69a6d59941ad90a0f0f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 26 Mar 2018 15:39:07 -1000
Subject: perf/hwbp: Simplify the perf-hwbp code, fix documentation

Annoyingly, modify_user_hw_breakpoint() unnecessarily complicates the
modification of a breakpoint - simplify it and remove the pointless
local variables.

Also update the stale Docbook while at it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3f8cb1e14588..253ae2da13c3 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -427,16 +427,9 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
  * @bp: the breakpoint structure to modify
  * @attr: new breakpoint attributes
- * @triggered: callback to trigger when we hit the breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
  */
 int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
 {
-	u64 old_addr = bp->attr.bp_addr;
-	u64 old_len = bp->attr.bp_len;
-	int old_type = bp->attr.bp_type;
-	int err = 0;
-
 	/*
 	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
 	 * will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -451,27 +444,18 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	bp->attr.bp_addr = attr->bp_addr;
 	bp->attr.bp_type = attr->bp_type;
 	bp->attr.bp_len = attr->bp_len;
+	bp->attr.disabled = 1;
 
-	if (attr->disabled)
-		goto end;
-
-	err = validate_hw_breakpoint(bp);
-	if (!err)
-		perf_event_enable(bp);
+	if (!attr->disabled) {
+		int err = validate_hw_breakpoint(bp);
 
-	if (err) {
-		bp->attr.bp_addr = old_addr;
-		bp->attr.bp_type = old_type;
-		bp->attr.bp_len = old_len;
-		if (!bp->attr.disabled)
-			perf_event_enable(bp);
+		if (err)
+			return err;
 
-		return err;
+		perf_event_enable(bp);
+		bp->attr.disabled = 0;
 	}
 
-end:
-	bp->attr.disabled = attr->disabled;
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
-- 
cgit v1.2.3-71-gd317


From c28d62cf52d791ba5f6db7ce525ed06b86291c82 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 27 Mar 2018 14:14:38 +0200
Subject: locking/rtmutex: Handle non enqueued waiters gracefully in
 remove_waiter()

In -RT task_blocks_on_rt_mutex() may return with -EAGAIN due to
(->pi_blocked_on == PI_WAKEUP_INPROGRESS) before it added itself as a
waiter. In such a case remove_waiter() must not be called because without a
waiter it will trigger the BUG_ON() statement.

This was initially reported by Yimin Deng. Thomas Gleixner fixed it then
with an explicit check for waiters before calling remove_waiter().

Instead of an explicit NULL check before calling rt_mutex_top_waiter() make
the function return NULL if there are no waiters. With that fixed the now
pointless NULL check is removed from rt_mutex_slowlock().

Reported-and-debugged-by: Yimin Deng <yimin11.deng@gmail.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/CAAh1qt=DCL9aUXNxanP5BKtiPp3m+qj4yB+gDohhXPVFCxWwzg@mail.gmail.com
Link: https://lkml.kernel.org/r/20180327121438.sss7hxg3crqy4ecd@linutronix.de
---
 kernel/locking/rtmutex.c        |  3 +--
 kernel/locking/rtmutex_common.h | 11 ++++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 940633c63254..4f014be7a4b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 	if (unlikely(ret)) {
 		__set_current_state(TASK_RUNNING);
-		if (rt_mutex_has_waiters(lock))
-			remove_waiter(lock, &waiter);
+		remove_waiter(lock, &waiter);
 		rt_mutex_handle_deadlock(ret, chwalk, &waiter);
 	}
 
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 68686b3ec3c1..d1d62f942be2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
 static inline struct rt_mutex_waiter *
 rt_mutex_top_waiter(struct rt_mutex *lock)
 {
-	struct rt_mutex_waiter *w;
-
-	w = rb_entry(lock->waiters.rb_leftmost,
-		     struct rt_mutex_waiter, tree_entry);
-	BUG_ON(w->lock != lock);
+	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+	struct rt_mutex_waiter *w = NULL;
 
+	if (leftmost) {
+		w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+		BUG_ON(w->lock != lock);
+	}
 	return w;
 }
 
-- 
cgit v1.2.3-71-gd317


From b3c39758c8a6972f02b43f83dba7fe7a352371b9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 27 Mar 2018 19:41:41 +0900
Subject: lockdep: Make the lock debug output more useful

The lock debug output in print_lock() has a few shortcomings:

 - It prints the hlock->acquire_ip field in %px and %pS format. That's
   redundant information.

 - It lacks information about the lock object itself. The lock class is
   not helpful to identify a particular instance of a lock.

Change the output so it prints:

 - hlock->instance to allow identification of a particular lock instance.

 - only the %pS format of hlock->ip_acquire which is sufficient to decode
   the actual code line with faddr2line.

The resulting output is:

3 locks held by a.out/31106:
#0: 00000000b0f753ba (&mm->mmap_sem){++++}, at: copy_process.part.41+0x10d5/0x1fe0
#1: 00000000ef64d539 (&mm->mmap_sem/1){+.+.}, at: copy_process.part.41+0x10fe/0x1fe0
#2: 00000000b41a282e (&mapping->i_mmap_rwsem){++++}, at: copy_process.part.41+0x12f2/0x1fe0

[ tglx: Massaged changelog ]

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: linux-mm@kvack.org
Cc: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/201803271941.GBE57310.tVSOJLQOFFOHFM@I-love.SAKURA.ne.jp
---
 kernel/locking/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 12a2805dd64a..023386338269 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock)
 		return;
 	}
 
+	printk(KERN_CONT "%p", hlock->instance);
 	print_lock_name(lock_classes + class_idx - 1);
-	printk(KERN_CONT ", at: [<%px>] %pS\n",
-		(void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
+	printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
 }
 
 static void lockdep_print_held_locks(struct task_struct *curr)
-- 
cgit v1.2.3-71-gd317


From 6ed70cf342de03c7b11cd4eb032705faeb29d284 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 29 Mar 2018 15:06:48 +0300
Subject: perf/x86/pt, coresight: Clean up address filter structure

This is a cosmetic patch that deals with the address filter structure's
ambiguous fields 'filter' and 'range'. The former stands to mean that the
filter's *action* should be to filter the traces to its address range if
it's set or stop tracing if it's unset. This is confusing and hard on the
eyes, so this patch replaces it with 'action' enum. The 'range' field is
completely redundant (meaning that the filter is an address range as
opposed to a single address trigger), as we can use zero size to mean the
same thing.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Acked-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20180329120648.11902-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/pt.c                       | 13 ++++--
 drivers/hwtracing/coresight/coresight-etm-perf.c | 59 +++++++++++-------------
 include/linux/perf_event.h                       | 14 ++++--
 kernel/events/core.c                             | 26 +++++++----
 4 files changed, 63 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 81fd41d5a0d9..3b993942a0e4 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1186,8 +1186,12 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
 	int range = 0;
 
 	list_for_each_entry(filter, filters, entry) {
-		/* PT doesn't support single address triggers */
-		if (!filter->range || !filter->size)
+		/*
+		 * PT doesn't support single address triggers and
+		 * 'start' filters.
+		 */
+		if (!filter->size ||
+		    filter->action == PERF_ADDR_FILTER_ACTION_START)
 			return -EOPNOTSUPP;
 
 		if (!filter->inode) {
@@ -1227,7 +1231,10 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
 
 		filters->filter[range].msr_a  = msr_a;
 		filters->filter[range].msr_b  = msr_b;
-		filters->filter[range].config = filter->filter ? 1 : 2;
+		if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
+			filters->filter[range].config = 1;
+		else
+			filters->filter[range].config = 2;
 		range++;
 	}
 
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 8a0ad77574e7..4e5ed6597f2f 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -393,35 +393,26 @@ static int etm_addr_filters_validate(struct list_head *filters)
 		if (++index > ETM_ADDR_CMP_MAX)
 			return -EOPNOTSUPP;
 
+		/* filter::size==0 means single address trigger */
+		if (filter->size) {
+			/*
+			 * The existing code relies on START/STOP filters
+			 * being address filters.
+			 */
+			if (filter->action == PERF_ADDR_FILTER_ACTION_START ||
+			    filter->action == PERF_ADDR_FILTER_ACTION_STOP)
+				return -EOPNOTSUPP;
+
+			range = true;
+		} else
+			address = true;
+
 		/*
-		 * As taken from the struct perf_addr_filter documentation:
-		 *	@range:	1: range, 0: address
-		 *
 		 * At this time we don't allow range and start/stop filtering
 		 * to cohabitate, they have to be mutually exclusive.
 		 */
-		if ((filter->range == 1) && address)
+		if (range && address)
 			return -EOPNOTSUPP;
-
-		if ((filter->range == 0) && range)
-			return -EOPNOTSUPP;
-
-		/*
-		 * For range filtering, the second address in the address
-		 * range comparator needs to be higher than the first.
-		 * Invalid otherwise.
-		 */
-		if (filter->range && filter->size == 0)
-			return -EINVAL;
-
-		/*
-		 * Everything checks out with this filter, record what we've
-		 * received before moving on to the next one.
-		 */
-		if (filter->range)
-			range = true;
-		else
-			address = true;
 	}
 
 	return 0;
@@ -441,18 +432,20 @@ static void etm_addr_filters_sync(struct perf_event *event)
 		stop = start + filter->size;
 		etm_filter = &filters->etm_filter[i];
 
-		if (filter->range == 1) {
+		switch (filter->action) {
+		case PERF_ADDR_FILTER_ACTION_FILTER:
 			etm_filter->start_addr = start;
 			etm_filter->stop_addr = stop;
 			etm_filter->type = ETM_ADDR_TYPE_RANGE;
-		} else {
-			if (filter->filter == 1) {
-				etm_filter->start_addr = start;
-				etm_filter->type = ETM_ADDR_TYPE_START;
-			} else {
-				etm_filter->stop_addr = stop;
-				etm_filter->type = ETM_ADDR_TYPE_STOP;
-			}
+			break;
+		case PERF_ADDR_FILTER_ACTION_START:
+			etm_filter->start_addr = start;
+			etm_filter->type = ETM_ADDR_TYPE_START;
+			break;
+		case PERF_ADDR_FILTER_ACTION_STOP:
+			etm_filter->stop_addr = stop;
+			etm_filter->type = ETM_ADDR_TYPE_STOP;
+			break;
 		}
 		i++;
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ff39ab011376..e71e99eb9a4e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -449,14 +449,19 @@ struct pmu {
 	int (*filter_match)		(struct perf_event *event); /* optional */
 };
 
+enum perf_addr_filter_action_t {
+	PERF_ADDR_FILTER_ACTION_STOP = 0,
+	PERF_ADDR_FILTER_ACTION_START,
+	PERF_ADDR_FILTER_ACTION_FILTER,
+};
+
 /**
  * struct perf_addr_filter - address range filter definition
  * @entry:	event's filter list linkage
  * @inode:	object file's inode for file-based filters
  * @offset:	filter range offset
- * @size:	filter range size
- * @range:	1: range, 0: address
- * @filter:	1: filter/start, 0: stop
+ * @size:	filter range size (size==0 means single address trigger)
+ * @action:	filter/start/stop
  *
  * This is a hardware-agnostic filter configuration as specified by the user.
  */
@@ -465,8 +470,7 @@ struct perf_addr_filter {
 	struct inode		*inode;
 	unsigned long		offset;
 	unsigned long		size;
-	unsigned int		range	: 1,
-				filter	: 1;
+	enum perf_addr_filter_action_t	action;
 };
 
 /**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7517b4fb3ef4..fc1c330c6bd6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8803,7 +8803,8 @@ restart:
  *  * for kernel addresses: <start address>[/<size>]
  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
  *
- * if <size> is not specified, the range is treated as a single address.
+ * if <size> is not specified or is zero, the range is treated as a single
+ * address; not valid for ACTION=="filter".
  */
 enum {
 	IF_ACT_NONE = -1,
@@ -8853,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 		return -ENOMEM;
 
 	while ((start = strsep(&fstr, " ,\n")) != NULL) {
+		static const enum perf_addr_filter_action_t actions[] = {
+			[IF_ACT_FILTER]	= PERF_ADDR_FILTER_ACTION_FILTER,
+			[IF_ACT_START]	= PERF_ADDR_FILTER_ACTION_START,
+			[IF_ACT_STOP]	= PERF_ADDR_FILTER_ACTION_STOP,
+		};
 		ret = -EINVAL;
 
 		if (!*start)
@@ -8869,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 		switch (token) {
 		case IF_ACT_FILTER:
 		case IF_ACT_START:
-			filter->filter = 1;
-
 		case IF_ACT_STOP:
 			if (state != IF_STATE_ACTION)
 				goto fail;
 
+			filter->action = actions[token];
 			state = IF_STATE_SOURCE;
 			break;
 
@@ -8887,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			if (state != IF_STATE_SOURCE)
 				goto fail;
 
-			if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
-				filter->range = 1;
-
 			*args[0].to = 0;
 			ret = kstrtoul(args[0].from, 0, &filter->offset);
 			if (ret)
 				goto fail;
 
-			if (filter->range) {
+			if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
 				*args[1].to = 0;
 				ret = kstrtoul(args[1].from, 0, &filter->size);
 				if (ret)
@@ -8903,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			}
 
 			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
-				int fpos = filter->range ? 2 : 1;
+				int fpos = token == IF_SRC_FILE ? 2 : 1;
 
 				filename = match_strdup(&args[fpos]);
 				if (!filename) {
@@ -8929,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			if (kernel && event->attr.exclude_kernel)
 				goto fail;
 
+			/*
+			 * ACTION "filter" must have a non-zero length region
+			 * specified.
+			 */
+			if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
+			    !filter->size)
+				goto fail;
+
 			if (!kernel) {
 				if (!filename)
 					goto fail;
-- 
cgit v1.2.3-71-gd317


From 5149cbac4235e12a34cf089592a8bd1c9fcfa467 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Fri, 30 Mar 2018 17:27:58 -0400
Subject: locking/rwsem: Add DEBUG_RWSEMS to look for lock/unlock mismatches

For a rwsem, locking can either be exclusive or shared. The corresponding
exclusive or shared unlock must be used. Otherwise, the protected data
structures may get corrupted or the lock may be in an inconsistent state.

In order to detect such anomaly, a new configuration option DEBUG_RWSEMS
is added which can be enabled to look for such mismatches and print
warnings that that happens.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1522445280-7767-2-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem.c | 4 ++++
 kernel/locking/rwsem.h | 8 +++++++-
 lib/Kconfig.debug      | 8 ++++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f549c552dbf1..30465a2f2b6c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock);
 void up_read(struct rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
 
 	__up_read(sem);
 }
@@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read);
 void up_write(struct rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	DEBUG_RWSEMS_WARN_ON(sem->owner != current);
 
 	rwsem_clear_owner(sem);
 	__up_write(sem);
@@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write);
 void downgrade_write(struct rw_semaphore *sem)
 {
 	lock_downgrade(&sem->dep_map, _RET_IP_);
+	DEBUG_RWSEMS_WARN_ON(sem->owner != current);
 
 	rwsem_set_reader_owned(sem);
 	__downgrade_write(sem);
@@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested);
 
 void up_read_non_owner(struct rw_semaphore *sem)
 {
+	DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
 	__up_read(sem);
 }
 
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a883b8f1fdc6..a17cba8d94bb 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -16,6 +16,12 @@
  */
 #define RWSEM_READER_OWNED	((struct task_struct *)1UL)
 
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c)	DEBUG_LOCKS_WARN_ON(c)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c)
+#endif
+
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 	 * do a write to the rwsem cacheline when it is really necessary
 	 * to minimize cacheline contention.
 	 */
-	if (sem->owner != RWSEM_READER_OWNED)
+	if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
 		WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
 }
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 64155e310a9f..88650ab5cedb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1075,6 +1075,13 @@ config DEBUG_WW_MUTEX_SLOWPATH
 	 even a debug kernel.  If you are a driver writer, enable it.  If
 	 you are a distro, do not.
 
+config DEBUG_RWSEMS
+	bool "RW Semaphore debugging: basic checks"
+	depends on DEBUG_KERNEL && RWSEM_SPIN_ON_OWNER
+	help
+	  This debugging feature allows mismatched rw semaphore locks and unlocks
+	  to be detected and reported.
+
 config DEBUG_LOCK_ALLOC
 	bool "Lock debugging: detect incorrect freeing of live locks"
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -1097,6 +1104,7 @@ config PROVE_LOCKING
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
+	select DEBUG_RWSEMS if RWSEM_SPIN_ON_OWNER
 	select DEBUG_LOCK_ALLOC
 	select TRACE_IRQFLAGS
 	default n
-- 
cgit v1.2.3-71-gd317


From d300b610812f3c10d146db4c18f98eba38834c70 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:26 +0100
Subject: kernel: use kernel_wait4() instead of sys_wait4()

All call sites of sys_wait4() set *rusage to NULL. Therefore, there is
no need for the copy_to_user() handling of *rusage, and we can use
kernel_wait4() directly.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Acked-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/exit.c          | 2 +-
 kernel/pid_namespace.c | 6 +++---
 kernel/umh.c           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 995453d9fb55..c3c7ac560114 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
  */
 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
 {
-	return sys_wait4(pid, stat_addr, options, NULL);
+	return kernel_wait4(pid, stat_addr, options, NULL);
 }
 
 #endif
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0b53eef7d34b..93b57f026688 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 
 	/*
 	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
-	 * sys_wait4() will also block until our children traced from the
+	 * kernel_wait4() will also block until our children traced from the
 	 * parent namespace are detached and become EXIT_DEAD.
 	 */
 	do {
 		clear_thread_flag(TIF_SIGPENDING);
-		rc = sys_wait4(-1, NULL, __WALL, NULL);
+		rc = kernel_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
 	/*
-	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+	 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
 	 * really care, we could reparent them to the global init. We could
 	 * exit and reap ->child_reaper even if it is not the last thread in
 	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
diff --git a/kernel/umh.c b/kernel/umh.c
index 18e5fa4b0e71..f76b3ff876cf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 {
 	pid_t pid;
 
-	/* If SIGCLD is ignored sys_wait4 won't populate the status. */
+	/* If SIGCLD is ignored kernel_wait4 won't populate the status. */
 	kernel_sigaction(SIGCHLD, SIG_DFL);
 	pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
 	if (pid < 0) {
@@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 		 *
 		 * Thus the __user pointer cast is valid here.
 		 */
-		sys_wait4(pid, (int __user *)&ret, 0, NULL);
+		kernel_wait4(pid, (int __user *)&ret, 0, NULL);
 
 		/*
 		 * If ret is 0, either call_usermodehelper_exec_async failed and
-- 
cgit v1.2.3-71-gd317


From d53238cd51a80f6f2e5b9d64830c62e2086787bd Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:37 +0100
Subject: kernel: open-code sys_rt_sigpending() in sys_sigpending()

A similar but not fully equivalent code path is already open-coded
three times (in sys_rt_sigpending and in the two compat stubs), so
do it a fourth time here.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 include/linux/syscalls.h |  2 +-
 kernel/signal.c          | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0526286a0314..a63e21e7a3af 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -288,7 +288,7 @@ asmlinkage long sys_capset(cap_user_header_t header,
 				const cap_user_data_t data);
 asmlinkage long sys_personality(unsigned int personality);
 
-asmlinkage long sys_sigpending(old_sigset_t __user *set);
+asmlinkage long sys_sigpending(old_sigset_t __user *uset);
 asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
 				old_sigset_t __user *oset);
 asmlinkage long sys_sigaltstack(const struct sigaltstack __user *uss,
diff --git a/kernel/signal.c b/kernel/signal.c
index c6e4c83dc090..985c61749bcf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3629,11 +3629,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
 
 /**
  *  sys_sigpending - examine pending signals
- *  @set: where mask of pending signal is returned
+ *  @uset: where mask of pending signal is returned
  */
-SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
+SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
 {
-	return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 
+	sigset_t set;
+	int err;
+
+	if (sizeof(old_sigset_t) > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
+	if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
+		err = -EFAULT;
+	return err;
 }
 
 #ifdef CONFIG_COMPAT
-- 
cgit v1.2.3-71-gd317


From 6b27aef09fea32b805a8c81287b1bb80362dadb0 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sat, 17 Mar 2018 15:18:30 +0100
Subject: kexec: call do_kexec_load() in compat syscall directly

do_kexec_load() can be called directly by compat_sys_kexec() as long as
the same parameters checks are completed which are currently handled
(also) by sys_kexec(). Therefore, move those to kexec_load_check(),
call that newly introduced helper function from both sys_kexec() and
compat_sys_kexec(), and duplicate the remaining code from sys_kexec()
in compat_sys_kexec().

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Eric Biederman <ebiederm@xmission.com>
Cc: kexec@lists.infradead.org
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/kexec.c | 52 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index e62ec4dc6620..aed8fb2564b3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -192,11 +192,9 @@ out:
  * that to happen you need to do that yourself.
  */
 
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
-		struct kexec_segment __user *, segments, unsigned long, flags)
+static inline int kexec_load_check(unsigned long nr_segments,
+				   unsigned long flags)
 {
-	int result;
-
 	/* We only trust the superuser with rebooting the system. */
 	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
 		return -EPERM;
@@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 		return -EINVAL;
 
-	/* Verify we are on the appropriate architecture */
-	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
-		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-		return -EINVAL;
-
 	/* Put an artificial cap on the number
 	 * of segments passed to kexec_load.
 	 */
 	if (nr_segments > KEXEC_SEGMENT_MAX)
 		return -EINVAL;
 
+	return 0;
+}
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+		struct kexec_segment __user *, segments, unsigned long, flags)
+{
+	int result;
+
+	result = kexec_load_check(nr_segments, flags);
+	if (result)
+		return result;
+
+	/* Verify we are on the appropriate architecture */
+	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+		return -EINVAL;
+
 	/* Because we write directly to the reserved memory
 	 * region when loading crash kernels we need a mutex here to
 	 * prevent multiple crash  kernels from attempting to load
@@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 	struct kexec_segment out, __user *ksegments;
 	unsigned long i, result;
 
+	result = kexec_load_check(nr_segments, flags);
+	if (result)
+		return result;
+
 	/* Don't allow clients that don't understand the native
 	 * architecture to do anything.
 	 */
 	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
 		return -EINVAL;
 
-	if (nr_segments > KEXEC_SEGMENT_MAX)
-		return -EINVAL;
-
 	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
 	for (i = 0; i < nr_segments; i++) {
 		result = copy_from_user(&in, &segments[i], sizeof(in));
@@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
 			return -EFAULT;
 	}
 
-	return sys_kexec_load(entry, nr_segments, ksegments, flags);
+	/* Because we write directly to the reserved memory
+	 * region when loading crash kernels we need a mutex here to
+	 * prevent multiple crash  kernels from attempting to load
+	 * simultaneously, and to prevent a crash kernel from loading
+	 * over the top of a in use crash kernel.
+	 *
+	 * KISS: always take the mutex.
+	 */
+	if (!mutex_trylock(&kexec_mutex))
+		return -EBUSY;
+
+	result = do_kexec_load(entry, nr_segments, ksegments, flags);
+
+	mutex_unlock(&kexec_mutex);
+
+	return result;
 }
 #endif
-- 
cgit v1.2.3-71-gd317


From 2de0db992de189fccc83fed57c30875144821491 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:26 +0100
Subject: mm: use do_futex() instead of sys_futex() in mm_release()

sys_futex() is a wrapper to do_futex() which does not modify any
values here:

- uaddr, val and val3 are kept the same

- op is masked with FUTEX_CMD_MASK, but is always set to FUTEX_WAKE.
  Therefore, val2 is always 0.

- as utime is set to NULL, *timeout is NULL

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 include/linux/futex.h | 13 ++++++++++---
 kernel/fork.c         |  4 ++--
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index c0fb9a24bbd2..821ae502d3d8 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -9,9 +9,6 @@ struct inode;
 struct mm_struct;
 struct task_struct;
 
-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
-	      u32 __user *uaddr2, u32 val2, u32 val3);
-
 extern int
 handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
 
@@ -55,6 +52,9 @@ union futex_key {
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+	      u32 __user *uaddr2, u32 val2, u32 val3);
 #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
 #define futex_cmpxchg_enabled 1
 #else
@@ -64,6 +64,13 @@ extern int futex_cmpxchg_enabled;
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+
+static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+			    ktime_t *timeout, u32 __user *uaddr2,
+			    u32 val2, u32 val3)
+{
+	return -EINVAL;
+}
 #endif
 
 #ifdef CONFIG_FUTEX_PI
diff --git a/kernel/fork.c b/kernel/fork.c
index e5d9d405ae4e..b1e031aac9db 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 			 * not set up a proper pointer then tough luck.
 			 */
 			put_user(0, tsk->clear_child_tid);
-			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
-					1, NULL, NULL, 0);
+			do_futex(tsk->clear_child_tid, FUTEX_WAKE,
+					1, NULL, NULL, 0, 0);
 		}
 		tsk->clear_child_tid = NULL;
 	}
-- 
cgit v1.2.3-71-gd317


From 192c58073d9abb1c507c89f109da5dc9f130ba70 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:27 +0100
Subject: kernel: add do_getpgid() helper; remove internal call to
 sys_getpgid()

Using the do_getpgid() helper removes an in-kernel call to the
sys_getpgid() syscall.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/sys.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index f2289de20e19..ebb138b841c8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1027,7 +1027,7 @@ out:
 	return err;
 }
 
-SYSCALL_DEFINE1(getpgid, pid_t, pid)
+static int do_getpgid(pid_t pid)
 {
 	struct task_struct *p;
 	struct pid *grp;
@@ -1055,11 +1055,16 @@ out:
 	return retval;
 }
 
+SYSCALL_DEFINE1(getpgid, pid_t, pid)
+{
+	return do_getpgid(pid);
+}
+
 #ifdef __ARCH_WANT_SYS_GETPGRP
 
 SYSCALL_DEFINE0(getpgrp)
 {
-	return sys_getpgid(0);
+	return do_getpgid(0);
 }
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 6203deb0a76f35b6dc476fecb228c850099a1bc4 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sat, 17 Mar 2018 17:11:51 +0100
Subject: kernel: add do_compat_sigaltstack() helper; remove in-kernel call to
 compat syscall

Using this helper allows us to avoid the in-kernel call to the
compat_sys_sigaltstack() syscall.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/signal.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 985c61749bcf..f04466655238 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
 }
 
 #ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(sigaltstack,
-			const compat_stack_t __user *, uss_ptr,
-			compat_stack_t __user *, uoss_ptr)
+static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
+				 compat_stack_t __user *uoss_ptr)
 {
 	stack_t uss, uoss;
 	int ret;
@@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
 	return ret;
 }
 
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+			const compat_stack_t __user *, uss_ptr,
+			compat_stack_t __user *, uoss_ptr)
+{
+	return do_compat_sigaltstack(uss_ptr, uoss_ptr);
+}
+
 int compat_restore_altstack(const compat_stack_t __user *uss)
 {
-	int err = compat_sys_sigaltstack(uss, NULL);
+	int err = do_compat_sigaltstack(uss, NULL);
 	/* squash all but -EFAULT for now */
 	return err == -EFAULT ? err : 0;
 }
-- 
cgit v1.2.3-71-gd317


From e530dca584a9aa4aedf26adf0ed3c1c9b80e2767 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 19 Mar 2018 18:09:27 +0100
Subject: kernel: provide ksys_*() wrappers for syscalls called by
 kernel/uid16.c

Using these helpers allows us to avoid the in-kernel calls to these
syscalls: sys_setregid(), sys_setgid(), sys_setreuid(), sys_setuid(),
sys_setresuid(), sys_setresgid(), sys_setfsuid(), and sys_setfsgid().

The ksys_ prefix denotes that these function are meant as a drop-in
replacement for the syscall. In particular, they use the same calling
convention.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/sys.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 kernel/uid16.c | 19 ++++++++++---------
 kernel/uid16.h | 14 ++++++++++++++
 3 files changed, 74 insertions(+), 17 deletions(-)
 create mode 100644 kernel/uid16.h

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index ebb138b841c8..550f47788ae4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -69,6 +69,8 @@
 #include <asm/io.h>
 #include <asm/unistd.h>
 
+#include "uid16.h"
+
 #ifndef SET_UNALIGN_CTL
 # define SET_UNALIGN_CTL(a, b)	(-EINVAL)
 #endif
@@ -340,7 +342,7 @@ out_unlock:
  *      operations (as far as semantic preservation is concerned).
  */
 #ifdef CONFIG_MULTIUSER
-SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+long __sys_setregid(gid_t rgid, gid_t egid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -392,12 +394,17 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+{
+	return __sys_setregid(rgid, egid);
+}
+
 /*
  * setgid() is implemented like SysV w/ SAVED_IDS
  *
  * SMP: Same implicit races as above.
  */
-SYSCALL_DEFINE1(setgid, gid_t, gid)
+long __sys_setgid(gid_t gid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -429,6 +436,11 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE1(setgid, gid_t, gid)
+{
+	return __sys_setgid(gid);
+}
+
 /*
  * change the user struct in a credentials set to match the new UID
  */
@@ -473,7 +485,7 @@ static int set_user(struct cred *new)
  * 100% compatible with BSD.  A program which uses just setuid() will be
  * 100% compatible with POSIX with saved IDs.
  */
-SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+long __sys_setreuid(uid_t ruid, uid_t euid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -533,6 +545,11 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+{
+	return __sys_setreuid(ruid, euid);
+}
+
 /*
  * setuid() is implemented like SysV with SAVED_IDS
  *
@@ -544,7 +561,7 @@ error:
  * will allow a root program to temporarily drop privileges and be able to
  * regain them by swapping the real and effective uid.
  */
-SYSCALL_DEFINE1(setuid, uid_t, uid)
+long __sys_setuid(uid_t uid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -586,12 +603,17 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE1(setuid, uid_t, uid)
+{
+	return __sys_setuid(uid);
+}
+
 
 /*
  * This function implements a generic ability to update ruid, euid,
  * and suid.  This allows you to implement the 4.4 compatible seteuid().
  */
-SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -656,6 +678,11 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+{
+	return __sys_setresuid(ruid, euid, suid);
+}
+
 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
 {
 	const struct cred *cred = current_cred();
@@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
 /*
  * Same as above, but for rgid, egid, sgid.
  */
-SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
 	struct user_namespace *ns = current_user_ns();
 	const struct cred *old;
@@ -730,6 +757,11 @@ error:
 	return retval;
 }
 
+SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+{
+	return __sys_setresgid(rgid, egid, sgid);
+}
+
 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
 {
 	const struct cred *cred = current_cred();
@@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
  * whatever uid it wants to). It normally shadows "euid", except when
  * explicitly set by setfsuid() or for access..
  */
-SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+long __sys_setfsuid(uid_t uid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -793,10 +825,15 @@ change_okay:
 	return old_fsuid;
 }
 
+SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+{
+	return __sys_setfsuid(uid);
+}
+
 /*
  * Samma på svenska..
  */
-SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+long __sys_setfsgid(gid_t gid)
 {
 	const struct cred *old;
 	struct cred *new;
@@ -830,6 +867,11 @@ change_okay:
 	commit_creds(new);
 	return old_fsgid;
 }
+
+SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+{
+	return __sys_setfsgid(gid);
+}
 #endif /* CONFIG_MULTIUSER */
 
 /**
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ef1da2a5f9bd..7b930edfe461 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,6 +18,8 @@
 
 #include <linux/uaccess.h>
 
+#include "uid16.h"
+
 SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
 {
 	return sys_chown(filename, low2highuid(user), low2highgid(group));
@@ -35,27 +37,27 @@ SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
 
 SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
 {
-	return sys_setregid(low2highgid(rgid), low2highgid(egid));
+	return __sys_setregid(low2highgid(rgid), low2highgid(egid));
 }
 
 SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
 {
-	return sys_setgid(low2highgid(gid));
+	return __sys_setgid(low2highgid(gid));
 }
 
 SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
 {
-	return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+	return __sys_setreuid(low2highuid(ruid), low2highuid(euid));
 }
 
 SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
 {
-	return sys_setuid(low2highuid(uid));
+	return __sys_setuid(low2highuid(uid));
 }
 
 SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
 {
-	return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+	return __sys_setresuid(low2highuid(ruid), low2highuid(euid),
 				 low2highuid(suid));
 }
 
@@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
 
 SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
 {
-	return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+	return __sys_setresgid(low2highgid(rgid), low2highgid(egid),
 				 low2highgid(sgid));
 }
 
-
 SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
 {
 	const struct cred *cred = current_cred();
@@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
 
 SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
 {
-	return sys_setfsuid(low2highuid(uid));
+	return __sys_setfsuid(low2highuid(uid));
 }
 
 SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
 {
-	return sys_setfsgid(low2highgid(gid));
+	return __sys_setfsgid(low2highgid(gid));
 }
 
 static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/uid16.h b/kernel/uid16.h
new file mode 100644
index 000000000000..cdca040f7602
--- /dev/null
+++ b/kernel/uid16.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_UID16_H
+#define LINUX_UID16_H
+
+long __sys_setuid(uid_t uid);
+long __sys_setgid(gid_t gid);
+long __sys_setreuid(uid_t ruid, uid_t euid);
+long __sys_setregid(gid_t rgid, gid_t egid);
+long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
+long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
+long __sys_setfsuid(uid_t uid);
+long __sys_setfsgid(gid_t gid);
+
+#endif /* LINUX_UID16_H */
-- 
cgit v1.2.3-71-gd317


From 7d4dd4f159b94003655b1688d9a4c0e2b6268ff8 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 14 Mar 2018 22:40:35 +0100
Subject: sched: add do_sched_yield() helper; remove in-kernel call to
 sched_yield()

Using the sched-internal do_sched_yield() helper allows us to get rid of
the sched-internal call to the sys_sched_yield() syscall.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Ingo Molnar <mingo@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/sched/core.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7c535eee0a6..8de4919c889a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4892,7 +4892,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  *
  * Return: 0.
  */
-SYSCALL_DEFINE0(sched_yield)
+static void do_sched_yield(void)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -4913,7 +4913,11 @@ SYSCALL_DEFINE0(sched_yield)
 	sched_preempt_enable_no_resched();
 
 	schedule();
+}
 
+SYSCALL_DEFINE0(sched_yield)
+{
+	do_sched_yield();
 	return 0;
 }
 
@@ -4997,7 +5001,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
 void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
-	sys_sched_yield();
+	do_sched_yield();
 }
 EXPORT_SYMBOL(yield);
 
-- 
cgit v1.2.3-71-gd317


From b6e9b0babb7a02ae4f00f053974609000f00950e Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sat, 17 Mar 2018 16:00:25 +0100
Subject: mm: add kernel_migrate_pages() helper, move compat syscall to
 mm/mempolicy.c

Move compat_sys_migrate_pages() to mm/mempolicy.c and make it call a newly
introduced helper -- kernel_migrate_pages() -- instead of the syscall.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/compat.c | 33 ---------------------------------
 mm/mempolicy.c  | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 3f5fa8902e7d..51bdf1808943 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -508,39 +508,6 @@ COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
 	}
 	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
-
-COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
-		       compat_ulong_t, maxnode,
-		       const compat_ulong_t __user *, old_nodes,
-		       const compat_ulong_t __user *, new_nodes)
-{
-	unsigned long __user *old = NULL;
-	unsigned long __user *new = NULL;
-	nodemask_t tmp_mask;
-	unsigned long nr_bits;
-	unsigned long size;
-
-	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
-	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-	if (old_nodes) {
-		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
-			return -EFAULT;
-		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
-		if (new_nodes)
-			new = old + size / sizeof(unsigned long);
-		if (copy_to_user(old, nodes_addr(tmp_mask), size))
-			return -EFAULT;
-	}
-	if (new_nodes) {
-		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
-			return -EFAULT;
-		if (new == NULL)
-			new = compat_alloc_user_space(size);
-		if (copy_to_user(new, nodes_addr(tmp_mask), size))
-			return -EFAULT;
-	}
-	return sys_migrate_pages(pid, nr_bits + 1, old, new);
-}
 #endif
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d879f1d8a44a..7399ede02b5f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1377,9 +1377,9 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
 	return do_set_mempolicy(mode, flags, &nodes);
 }
 
-SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
-		const unsigned long __user *, old_nodes,
-		const unsigned long __user *, new_nodes)
+static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
+				const unsigned long __user *old_nodes,
+				const unsigned long __user *new_nodes)
 {
 	struct mm_struct *mm = NULL;
 	struct task_struct *task;
@@ -1469,6 +1469,13 @@ out_put:
 
 }
 
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+		const unsigned long __user *, old_nodes,
+		const unsigned long __user *, new_nodes)
+{
+	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
+}
+
 
 /* Retrieve NUMA policy */
 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
@@ -1571,7 +1578,40 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 }
 
-#endif
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+		       compat_ulong_t, maxnode,
+		       const compat_ulong_t __user *, old_nodes,
+		       const compat_ulong_t __user *, new_nodes)
+{
+	unsigned long __user *old = NULL;
+	unsigned long __user *new = NULL;
+	nodemask_t tmp_mask;
+	unsigned long nr_bits;
+	unsigned long size;
+
+	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+	if (old_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+			return -EFAULT;
+		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+		if (new_nodes)
+			new = old + size / sizeof(unsigned long);
+		if (copy_to_user(old, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	if (new_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+			return -EFAULT;
+		if (new == NULL)
+			new = compat_alloc_user_space(size);
+		if (copy_to_user(new, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
+}
+
+#endif /* CONFIG_COMPAT */
 
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
-- 
cgit v1.2.3-71-gd317


From 7addf44388255f6fa99c83e3e2ad79cef0813698 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sat, 17 Mar 2018 16:08:03 +0100
Subject: mm: add kernel_move_pages() helper, move compat syscall to
 mm/migrate.c

Move compat_sys_move_pages() to mm/migrate.c and make it call a newly
introduced helper -- kernel_move_pages() -- instead of the syscall.

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/compat.c | 22 ----------------------
 mm/migrate.c    | 39 +++++++++++++++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 51bdf1808943..6d21894806b4 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -488,28 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 }
 EXPORT_SYMBOL_GPL(get_compat_sigset);
 
-#ifdef CONFIG_NUMA
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
-		       compat_uptr_t __user *, pages32,
-		       const int __user *, nodes,
-		       int __user *, status,
-		       int, flags)
-{
-	const void __user * __user *pages;
-	int i;
-
-	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
-	for (i = 0; i < nr_pages; i++) {
-		compat_uptr_t p;
-
-		if (get_user(p, pages32 + i) ||
-			put_user(compat_ptr(p), pages + i))
-			return -EFAULT;
-	}
-	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-#endif
-
 /*
  * Allocate user-space memory for the duration of a single system call,
  * in order to marshall parameters inside a compat thunk.
diff --git a/mm/migrate.c b/mm/migrate.c
index 1e5525a25691..003886606a22 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -34,6 +34,7 @@
 #include <linux/backing-dev.h>
 #include <linux/compaction.h>
 #include <linux/syscalls.h>
+#include <linux/compat.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
@@ -1745,10 +1746,10 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
  * Move a list of pages in the address space of the currently executing
  * process.
  */
-SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
-		const void __user * __user *, pages,
-		const int __user *, nodes,
-		int __user *, status, int, flags)
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+			     const void __user * __user *pages,
+			     const int __user *nodes,
+			     int __user *status, int flags)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
@@ -1807,6 +1808,36 @@ out:
 	return err;
 }
 
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, pages,
+		const int __user *, nodes,
+		int __user *, status, int, flags)
+{
+	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+		       compat_uptr_t __user *, pages32,
+		       const int __user *, nodes,
+		       int __user *, status,
+		       int, flags)
+{
+	const void __user * __user *pages;
+	int i;
+
+	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+	for (i = 0; i < nr_pages; i++) {
+		compat_uptr_t p;
+
+		if (get_user(p, pages32 + i) ||
+			put_user(compat_ptr(p), pages + i))
+			return -EFAULT;
+	}
+	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+#endif /* CONFIG_COMPAT */
+
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Returns true if this is a safe migration target node for misplaced NUMA
-- 
cgit v1.2.3-71-gd317


From ab0d1e85bfd0c25260f02cd3708d5abdfb5b5a9c Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 4 Mar 2018 21:54:05 +0100
Subject: fs/quota: use COMPAT_SYSCALL_DEFINE for sys32_quotactl()

While sys32_quotactl() is only needed on x86, it can use the recommended
COMPAT_SYSCALL_DEFINEx() machinery for its setup.

Acked-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 2 +-
 fs/quota/compat.c                      | 5 +++--
 include/linux/compat.h                 | 3 +++
 include/linux/syscalls.h               | 2 --
 kernel/sys_ni.c                        | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef6edaf285cd..c58f75b088c5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -137,7 +137,7 @@
 128	i386	init_module		sys_init_module
 129	i386	delete_module		sys_delete_module
 130	i386	get_kernel_syms
-131	i386	quotactl		sys_quotactl			sys32_quotactl
+131	i386	quotactl		sys_quotactl			compat_sys_quotactl32
 132	i386	getpgid			sys_getpgid
 133	i386	fchdir			sys_fchdir
 134	i386	bdflush			sys_bdflush
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
index 1577a2fd51f4..c30572857619 100644
--- a/fs/quota/compat.c
+++ b/fs/quota/compat.c
@@ -41,8 +41,9 @@ struct compat_fs_quota_stat {
 	__u16		qs_iwarnlimit;
 };
 
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
-						qid_t id, void __user *addr)
+COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd,
+		       const char __user *, special, qid_t, id,
+		       void __user *, addr)
 {
 	unsigned int cmds;
 	struct if_dqblk __user *dqblk;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 16c3027074a2..f1649a5e6716 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -461,6 +461,9 @@ asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
 		const struct compat_iovec __user *vec,
 		compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
 
+asmlinkage long compat_sys_quotactl32(unsigned int cmd,
+		const char __user *special, qid_t id, void __user *addr);
+
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
 asmlinkage long compat_sys_preadv64(unsigned long fd,
 		const struct compat_iovec __user *vec,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a63e21e7a3af..6ab7ed71a8b6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -241,8 +241,6 @@ static inline void addr_limit_user_check(void)
 #endif
 }
 
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
-			       qid_t id, void __user *addr);
 asmlinkage long sys_time(time_t __user *tloc);
 asmlinkage long sys_stime(time_t __user *tptr);
 asmlinkage long sys_gettimeofday(struct timeval __user *tv,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b5189762d275..951dbda5c2b4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,7 +18,7 @@ asmlinkage long sys_ni_syscall(void)
 }
 
 cond_syscall(sys_quotactl);
-cond_syscall(sys32_quotactl);
+cond_syscall(compat_sys_quotactl32);
 cond_syscall(sys_acct);
 cond_syscall(sys_lookup_dcookie);
 cond_syscall(compat_sys_lookup_dcookie);
-- 
cgit v1.2.3-71-gd317


From 55731b3cda3a85ee888dac3bf1f36489f275c187 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:55 +0100
Subject: fs: add do_fchownat(), ksys_fchown() helpers and ksys_{,l}chown()
 wrappers

Using the fs-interal do_fchownat() wrapper allows us to get rid of
fs-internal calls to the sys_fchownat() syscall.

Introducing the ksys_fchown() helper and the ksys_{,}chown() wrappers
allows us to avoid the in-kernel calls to the sys_{,l,f}chown() syscalls.
The ksys_ prefix denotes that these functions are meant as a drop-in
replacement for the syscalls. In particular, they use the same calling
convention as sys_{,l,f}chown().

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/s390/kernel/compat_linux.c |  6 +++---
 fs/internal.h                   |  2 ++
 fs/open.c                       | 23 +++++++++++++++++------
 include/linux/syscalls.h        | 17 +++++++++++++++++
 init/initramfs.c                |  8 ++++----
 kernel/uid16.c                  |  6 +++---
 6 files changed, 46 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 5a9cfde5fc28..9a9bb395359c 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -89,18 +89,18 @@
 COMPAT_SYSCALL_DEFINE3(s390_chown16, const char __user *, filename,
 		       u16, user, u16, group)
 {
-	return sys_chown(filename, low2highuid(user), low2highgid(group));
+	return ksys_chown(filename, low2highuid(user), low2highgid(group));
 }
 
 COMPAT_SYSCALL_DEFINE3(s390_lchown16, const char __user *,
 		       filename, u16, user, u16, group)
 {
-	return sys_lchown(filename, low2highuid(user), low2highgid(group));
+	return ksys_lchown(filename, low2highuid(user), low2highgid(group));
 }
 
 COMPAT_SYSCALL_DEFINE3(s390_fchown16, unsigned int, fd, u16, user, u16, group)
 {
-	return sys_fchown(fd, low2highuid(user), low2highgid(group));
+	return ksys_fchown(fd, low2highuid(user), low2highgid(group));
 }
 
 COMPAT_SYSCALL_DEFINE2(s390_setregid16, u16, rgid, u16, egid)
diff --git a/fs/internal.h b/fs/internal.h
index 26f4f05b52ef..c797480cbd6f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -121,6 +121,8 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 
 long do_faccessat(int dfd, const char __user *filename, int mode);
 int do_fchmodat(int dfd, const char __user *filename, umode_t mode);
+int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+		int flag);
 
 extern int open_check_o_direct(struct file *f);
 extern int vfs_open(const struct path *, struct file *, const struct cred *);
diff --git a/fs/open.c b/fs/open.c
index 0fc8188be31a..7b2eccb541f2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -645,8 +645,8 @@ retry_deleg:
 	return error;
 }
 
-SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
-		gid_t, group, int, flag)
+int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+		int flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -677,18 +677,24 @@ out:
 	return error;
 }
 
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+		gid_t, group, int, flag)
+{
+	return do_fchownat(dfd, filename, user, group, flag);
+}
+
 SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 {
-	return sys_fchownat(AT_FDCWD, filename, user, group, 0);
+	return do_fchownat(AT_FDCWD, filename, user, group, 0);
 }
 
 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
-	return sys_fchownat(AT_FDCWD, filename, user, group,
-			    AT_SYMLINK_NOFOLLOW);
+	return do_fchownat(AT_FDCWD, filename, user, group,
+			   AT_SYMLINK_NOFOLLOW);
 }
 
-SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
+int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
 {
 	struct fd f = fdget(fd);
 	int error = -EBADF;
@@ -708,6 +714,11 @@ out:
 	return error;
 }
 
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
+{
+	return ksys_fchown(fd, user, group);
+}
+
 int open_check_o_direct(struct file *f)
 {
 	/* NB: we're sure to have correct a_ops only after f_op->open */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33f06de090ea..df0d1e818a6e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -954,6 +954,7 @@ int ksys_chroot(const char __user *filename);
 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count);
 int ksys_chdir(const char __user *filename);
 int ksys_fchmod(unsigned int fd, umode_t mode);
+int ksys_fchown(unsigned int fd, uid_t user, gid_t group);
 
 /*
  * The following kernel syscall equivalents are just wrappers to fs-internal
@@ -1021,4 +1022,20 @@ static inline long ksys_access(const char __user *filename, int mode)
 	return do_faccessat(AT_FDCWD, filename, mode);
 }
 
+extern int do_fchownat(int dfd, const char __user *filename, uid_t user,
+		       gid_t group, int flag);
+
+static inline long ksys_chown(const char __user *filename, uid_t user,
+			      gid_t group)
+{
+	return do_fchownat(AT_FDCWD, filename, user, group, 0);
+}
+
+static inline long ksys_lchown(const char __user *filename, uid_t user,
+			       gid_t group)
+{
+	return do_fchownat(AT_FDCWD, filename, user, group,
+			     AT_SYMLINK_NOFOLLOW);
+}
+
 #endif
diff --git a/init/initramfs.c b/init/initramfs.c
index 16c3c23076e2..35173bef7c00 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -343,7 +343,7 @@ static int __init do_name(void)
 			wfd = sys_open(collected, openflags, mode);
 
 			if (wfd >= 0) {
-				sys_fchown(wfd, uid, gid);
+				ksys_fchown(wfd, uid, gid);
 				ksys_fchmod(wfd, mode);
 				if (body_len)
 					sys_ftruncate(wfd, body_len);
@@ -353,14 +353,14 @@ static int __init do_name(void)
 		}
 	} else if (S_ISDIR(mode)) {
 		ksys_mkdir(collected, mode);
-		sys_chown(collected, uid, gid);
+		ksys_chown(collected, uid, gid);
 		ksys_chmod(collected, mode);
 		dir_add(collected, mtime);
 	} else if (S_ISBLK(mode) || S_ISCHR(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
 		if (maybe_link() == 0) {
 			ksys_mknod(collected, mode, rdev);
-			sys_chown(collected, uid, gid);
+			ksys_chown(collected, uid, gid);
 			ksys_chmod(collected, mode);
 			do_utime(collected, mtime);
 		}
@@ -393,7 +393,7 @@ static int __init do_symlink(void)
 	collected[N_ALIGN(name_len) + body_len] = '\0';
 	clean_path(collected, 0);
 	ksys_symlink(collected + N_ALIGN(name_len), collected);
-	sys_lchown(collected, uid, gid);
+	ksys_lchown(collected, uid, gid);
 	do_utime(collected, mtime);
 	state = SkipIt;
 	next_state = Reset;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 7b930edfe461..af6925d8599b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -22,17 +22,17 @@
 
 SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
 {
-	return sys_chown(filename, low2highuid(user), low2highgid(group));
+	return ksys_chown(filename, low2highuid(user), low2highgid(group));
 }
 
 SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
 {
-	return sys_lchown(filename, low2highuid(user), low2highgid(group));
+	return ksys_lchown(filename, low2highuid(user), low2highgid(group));
 }
 
 SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
 {
-	return sys_fchown(fd, low2highuid(user), low2highgid(group));
+	return ksys_fchown(fd, low2highuid(user), low2highgid(group));
 }
 
 SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
-- 
cgit v1.2.3-71-gd317


From 70f68ee81e2e9ad5105b8d2bd324e890e94c6ad9 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 14 Mar 2018 22:35:11 +0100
Subject: fs: add ksys_sync() helper; remove in-kernel calls to sys_sync()

Using this helper allows us to avoid the in-kernel calls to the
sys_sync() syscall. The ksys_ prefix denotes that this function
is meant as a drop-in replacement for the syscall. In particular, it
uses the same calling convention as sys_sync().

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 arch/sparc/kernel/setup_32.c | 2 +-
 drivers/tty/sysrq.c          | 2 +-
 fs/sync.c                    | 7 ++++++-
 include/linux/syscalls.h     | 1 +
 kernel/power/hibernate.c     | 2 +-
 kernel/power/suspend.c       | 2 +-
 kernel/power/user.c          | 2 +-
 7 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index 2e3a3e203061..13664c377196 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -86,7 +86,7 @@ static void prom_sync_me(void)
 	show_free_areas(0, NULL);
 	if (!is_idle_task(current)) {
 		local_irq_enable();
-		sys_sync();
+		ksys_sync();
 		local_irq_disable();
 	}
 	prom_printf("Returning to prom\n");
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b674793be478..6364890575ec 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -660,7 +660,7 @@ static void sysrq_do_reset(struct timer_list *t)
 
 	state->reset_requested = true;
 
-	sys_sync();
+	ksys_sync();
 	kernel_restart(NULL);
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..602ae94bb67e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -105,7 +105,7 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  * just write metadata (such as inodes or bitmaps) to block device page cache
  * and do not sync it on their own in ->sync_fs().
  */
-SYSCALL_DEFINE0(sync)
+void ksys_sync(void)
 {
 	int nowait = 0, wait = 1;
 
@@ -117,6 +117,11 @@ SYSCALL_DEFINE0(sync)
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
+}
+
+SYSCALL_DEFINE0(sync)
+{
+	ksys_sync();
 	return 0;
 }
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3a2e90842ff8..0a9942b3e718 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -960,6 +960,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
 int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
 off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence);
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count);
+void ksys_sync(void);
 
 /*
  * The following kernel syscall equivalents are just wrappers to fs-internal
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a5c36e9c56a6..4710f1b142fc 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -701,7 +701,7 @@ int hibernate(void)
 	}
 
 	pr_info("Syncing filesystems ... \n");
-	sys_sync();
+	ksys_sync();
 	pr_info("done.\n");
 
 	error = freeze_processes();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0685c4499431..4c10be0f4843 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state)
 #ifndef CONFIG_SUSPEND_SKIP_SYNC
 	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
 	pr_info("Syncing filesystems ... ");
-	sys_sync();
+	ksys_sync();
 	pr_cont("done.\n");
 	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 #endif
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 22df9f7ff672..75c959de4b29 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 			break;
 
 		printk("Syncing filesystems ... ");
-		sys_sync();
+		ksys_sync();
 		printk("done.\n");
 
 		error = freeze_processes();
-- 
cgit v1.2.3-71-gd317


From 9b32105ec6b13d32d5db6a6e7992c97ce54b5ea7 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Mar 2018 11:34:42 +0100
Subject: kernel: add ksys_unshare() helper; remove in-kernel calls to
 sys_unshare()

Using this helper allows us to avoid the in-kernel calls to the
sys_unshare() syscall. The ksys_ prefix denotes that this function is meant
as a drop-in replacement for the syscall. In particular, it uses the same
calling convention as sys_unshare().

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 drivers/base/devtmpfs.c  | 2 +-
 include/linux/syscalls.h | 1 +
 init/do_mounts_initrd.c  | 2 +-
 kernel/fork.c            | 7 ++++++-
 4 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 3f7ee954fd2c..f7768077e817 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -380,7 +380,7 @@ static int devtmpfsd(void *p)
 {
 	char options[] = "mode=0755";
 	int *err = p;
-	*err = sys_unshare(CLONE_NEWNS);
+	*err = ksys_unshare(CLONE_NEWNS);
 	if (*err)
 		goto out;
 	*err = ksys_mount("devtmpfs", "/", "devtmpfs", MS_SILENT, options);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0a9942b3e718..e724dda509e0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -961,6 +961,7 @@ int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
 off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence);
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count);
 void ksys_sync(void);
+int ksys_unshare(unsigned long unshare_flags);
 
 /*
  * The following kernel syscall equivalents are just wrappers to fs-internal
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 03ec0c1b7553..d1d3e53bdeef 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -36,7 +36,7 @@ __setup("noinitrd", no_initrd);
 
 static int init_linuxrc(struct subprocess_info *info, struct cred *new)
 {
-	sys_unshare(CLONE_FS | CLONE_FILES);
+	ksys_unshare(CLONE_FS | CLONE_FILES);
 	/* stdin/stdout/stderr for /linuxrc */
 	ksys_open("/dev/console", O_RDWR, 0);
 	ksys_dup(0);
diff --git a/kernel/fork.c b/kernel/fork.c
index b1e031aac9db..f71b67dc156d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  * constructed. Here we are modifying the current, active,
  * task_struct.
  */
-SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+int ksys_unshare(unsigned long unshare_flags)
 {
 	struct fs_struct *fs, *new_fs = NULL;
 	struct files_struct *fd, *new_fd = NULL;
@@ -2470,6 +2470,11 @@ bad_unshare_out:
 	return err;
 }
 
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+	return ksys_unshare(unshare_flags);
+}
+
 /*
  *	Helper to unshare the files of the current task.
  *	We don't want to expose copy_files internals to
-- 
cgit v1.2.3-71-gd317


From e2aaa9f423367ee03755d632555c242629a08d00 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Fri, 16 Mar 2018 12:36:06 +0100
Subject: kernel: add ksys_setsid() helper; remove in-kernel call to
 sys_setsid()

Using this helper allows us to avoid the in-kernel call to the
sys_setsid() syscall. The ksys_ prefix denotes that this function
is meant as a drop-in replacement for the syscall. In particular, it
uses the same calling convention as sys_setsid().

This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 include/linux/syscalls.h | 1 +
 init/do_mounts_initrd.c  | 2 +-
 kernel/sys.c             | 7 ++++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e724dda509e0..4dd685ee425d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -962,6 +962,7 @@ off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence);
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count);
 void ksys_sync(void);
 int ksys_unshare(unsigned long unshare_flags);
+int ksys_setsid(void);
 
 /*
  * The following kernel syscall equivalents are just wrappers to fs-internal
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index d1d3e53bdeef..5a91aefa7305 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -45,7 +45,7 @@ static int init_linuxrc(struct subprocess_info *info, struct cred *new)
 	ksys_chdir("/root");
 	ksys_mount(".", "/", NULL, MS_MOVE, NULL);
 	ksys_chroot(".");
-	sys_setsid();
+	ksys_setsid();
 	return 0;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 550f47788ae4..ad692183dfe9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1150,7 +1150,7 @@ static void set_special_pids(struct pid *pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
 }
 
-SYSCALL_DEFINE0(setsid)
+int ksys_setsid(void)
 {
 	struct task_struct *group_leader = current->group_leader;
 	struct pid *sid = task_pid(group_leader);
@@ -1183,6 +1183,11 @@ out:
 	return err;
 }
 
+SYSCALL_DEFINE0(setsid)
+{
+	return ksys_setsid();
+}
+
 DECLARE_RWSEM(uts_sem);
 
 #ifdef COMPAT_UTS_MACHINE
-- 
cgit v1.2.3-71-gd317


From 70dd4b3160798b647b7f30baf9fb6e8a5933d4e2 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Tue, 6 Mar 2018 19:53:01 +0100
Subject: kernel/sys_ni: sort cond_syscall() entries

Shuffle the cond_syscall() entries in kernel/sys_ni.c around so that they
are kept in the same order as in include/uapi/asm-generic/unistd.h. For
better structuring, add the same comments as in that file, but keep a few
additional comments and extend the commentary where it seems useful.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 kernel/sys_ni.c | 506 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 332 insertions(+), 174 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 951dbda5c2b4..0c1538f5a559 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -17,245 +17,403 @@ asmlinkage long sys_ni_syscall(void)
 	return -ENOSYS;
 }
 
-cond_syscall(sys_quotactl);
-cond_syscall(compat_sys_quotactl32);
-cond_syscall(sys_acct);
+/*
+ * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
+ * Architecture specific entries go below, followed by deprecated or obsolete
+ * system calls.
+ */
+
+cond_syscall(sys_io_setup);
+cond_syscall(compat_sys_io_setup);
+cond_syscall(sys_io_destroy);
+cond_syscall(sys_io_submit);
+cond_syscall(compat_sys_io_submit);
+cond_syscall(sys_io_cancel);
+cond_syscall(sys_io_getevents);
+cond_syscall(compat_sys_io_getevents);
+
+/* fs/xattr.c */
+
+/* fs/dcache.c */
+
+/* fs/cookies.c */
 cond_syscall(sys_lookup_dcookie);
 cond_syscall(compat_sys_lookup_dcookie);
-cond_syscall(sys_swapon);
-cond_syscall(sys_swapoff);
+
+/* fs/eventfd.c */
+cond_syscall(sys_eventfd2);
+
+/* fs/eventfd.c */
+cond_syscall(sys_epoll_create1);
+cond_syscall(sys_epoll_ctl);
+cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
+
+/* fs/fcntl.c */
+
+/* fs/inotify_user.c */
+cond_syscall(sys_inotify_init1);
+cond_syscall(sys_inotify_add_watch);
+cond_syscall(sys_inotify_rm_watch);
+
+/* fs/ioctl.c */
+
+/* fs/ioprio.c */
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
+
+/* fs/locks.c */
+cond_syscall(sys_flock);
+
+/* fs/namei.c */
+
+/* fs/namespace.c */
+
+/* fs/nfsctl.c */
+
+/* fs/open.c */
+
+/* fs/pipe.c */
+
+/* fs/quota.c */
+cond_syscall(sys_quotactl);
+
+/* fs/readdir.c */
+
+/* fs/read_write.c */
+
+/* fs/sendfile.c */
+
+/* fs/select.c */
+
+/* fs/signalfd.c */
+cond_syscall(sys_signalfd4);
+cond_syscall(compat_sys_signalfd4);
+
+/* fs/splice.c */
+
+/* fs/stat.c */
+
+/* fs/sync.c */
+
+/* fs/timerfd.c */
+cond_syscall(sys_timerfd_create);
+cond_syscall(sys_timerfd_settime);
+cond_syscall(compat_sys_timerfd_settime);
+cond_syscall(sys_timerfd_gettime);
+cond_syscall(compat_sys_timerfd_gettime);
+
+/* fs/utimes.c */
+
+/* kernel/acct.c */
+cond_syscall(sys_acct);
+
+/* kernel/capability.c */
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
+
+/* kernel/exec_domain.c */
+
+/* kernel/exit.c */
+
+/* kernel/fork.c */
+
+/* kernel/futex.c */
+cond_syscall(sys_futex);
+cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
+
+/* kernel/hrtimer.c */
+
+/* kernel/itimer.c */
+
+/* kernel/kexec.c */
 cond_syscall(sys_kexec_load);
 cond_syscall(compat_sys_kexec_load);
-cond_syscall(sys_kexec_file_load);
+
+/* kernel/module.c */
 cond_syscall(sys_init_module);
-cond_syscall(sys_finit_module);
 cond_syscall(sys_delete_module);
+
+/* kernel/posix-timers.c */
+
+/* kernel/printk.c */
+cond_syscall(sys_syslog);
+
+/* kernel/ptrace.c */
+
+/* kernel/sched/core.c */
+
+/* kernel/signal.c */
+
+/* kernel/sys.c */
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+
+/* kernel/time.c */
+
+/* kernel/timer.c */
+
+/* ipc/mqueue.c */
+cond_syscall(sys_mq_open);
+cond_syscall(compat_sys_mq_open);
+cond_syscall(sys_mq_unlink);
+cond_syscall(sys_mq_timedsend);
+cond_syscall(compat_sys_mq_timedsend);
+cond_syscall(sys_mq_timedreceive);
+cond_syscall(compat_sys_mq_timedreceive);
+cond_syscall(sys_mq_notify);
+cond_syscall(compat_sys_mq_notify);
+cond_syscall(sys_mq_getsetattr);
+cond_syscall(compat_sys_mq_getsetattr);
+
+/* ipc/msg.c */
+cond_syscall(sys_msgget);
+cond_syscall(sys_msgctl);
+cond_syscall(compat_sys_msgctl);
+cond_syscall(sys_msgrcv);
+cond_syscall(compat_sys_msgrcv);
+cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd);
+
+/* ipc/sem.c */
+cond_syscall(sys_semget);
+cond_syscall(sys_semctl);
+cond_syscall(compat_sys_semctl);
+cond_syscall(sys_semtimedop);
+cond_syscall(compat_sys_semtimedop);
+cond_syscall(sys_semop);
+
+/* ipc/shm.c */
+cond_syscall(sys_shmget);
+cond_syscall(sys_shmctl);
+cond_syscall(compat_sys_shmctl);
+cond_syscall(sys_shmat);
+cond_syscall(compat_sys_shmat);
+cond_syscall(sys_shmdt);
+
+/* net/socket.c */
+cond_syscall(sys_socket);
 cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
-cond_syscall(sys_accept4);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
-cond_syscall(sys_sendto);
-cond_syscall(sys_send);
-cond_syscall(sys_recvfrom);
-cond_syscall(sys_recv);
-cond_syscall(sys_socket);
 cond_syscall(sys_setsockopt);
 cond_syscall(compat_sys_setsockopt);
 cond_syscall(sys_getsockopt);
 cond_syscall(compat_sys_getsockopt);
+cond_syscall(sys_sendto);
 cond_syscall(sys_shutdown);
+cond_syscall(sys_recvfrom);
+cond_syscall(compat_sys_recvfrom);
 cond_syscall(sys_sendmsg);
-cond_syscall(sys_sendmmsg);
 cond_syscall(compat_sys_sendmsg);
-cond_syscall(compat_sys_sendmmsg);
 cond_syscall(sys_recvmsg);
-cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
-cond_syscall(compat_sys_recv);
-cond_syscall(compat_sys_recvfrom);
-cond_syscall(compat_sys_recvmmsg);
-cond_syscall(sys_socketcall);
-cond_syscall(sys_futex);
-cond_syscall(compat_sys_futex);
-cond_syscall(sys_set_robust_list);
-cond_syscall(compat_sys_set_robust_list);
-cond_syscall(sys_get_robust_list);
-cond_syscall(compat_sys_get_robust_list);
-cond_syscall(sys_epoll_create);
-cond_syscall(sys_epoll_create1);
-cond_syscall(sys_epoll_ctl);
-cond_syscall(sys_epoll_wait);
-cond_syscall(sys_epoll_pwait);
-cond_syscall(compat_sys_epoll_pwait);
-cond_syscall(sys_semget);
-cond_syscall(sys_semop);
-cond_syscall(sys_semtimedop);
-cond_syscall(compat_sys_semtimedop);
-cond_syscall(sys_semctl);
-cond_syscall(compat_sys_semctl);
-cond_syscall(sys_msgget);
-cond_syscall(sys_msgsnd);
-cond_syscall(compat_sys_msgsnd);
-cond_syscall(sys_msgrcv);
-cond_syscall(compat_sys_msgrcv);
-cond_syscall(sys_msgctl);
-cond_syscall(compat_sys_msgctl);
-cond_syscall(sys_shmget);
-cond_syscall(sys_shmat);
-cond_syscall(compat_sys_shmat);
-cond_syscall(sys_shmdt);
-cond_syscall(sys_shmctl);
-cond_syscall(compat_sys_shmctl);
-cond_syscall(sys_mq_open);
-cond_syscall(sys_mq_unlink);
-cond_syscall(sys_mq_timedsend);
-cond_syscall(sys_mq_timedreceive);
-cond_syscall(sys_mq_notify);
-cond_syscall(sys_mq_getsetattr);
-cond_syscall(compat_sys_mq_open);
-cond_syscall(compat_sys_mq_timedsend);
-cond_syscall(compat_sys_mq_timedreceive);
-cond_syscall(compat_sys_mq_notify);
-cond_syscall(compat_sys_mq_getsetattr);
-cond_syscall(sys_mbind);
-cond_syscall(sys_get_mempolicy);
-cond_syscall(sys_set_mempolicy);
-cond_syscall(compat_sys_mbind);
-cond_syscall(compat_sys_get_mempolicy);
-cond_syscall(compat_sys_set_mempolicy);
+
+/* mm/filemap.c */
+
+/* mm/nommu.c, also with MMU */
+cond_syscall(sys_mremap);
+
+/* security/keys/keyctl.c */
 cond_syscall(sys_add_key);
 cond_syscall(sys_request_key);
 cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
-cond_syscall(compat_sys_socketcall);
-cond_syscall(sys_inotify_init);
-cond_syscall(sys_inotify_init1);
-cond_syscall(sys_inotify_add_watch);
-cond_syscall(sys_inotify_rm_watch);
-cond_syscall(sys_migrate_pages);
-cond_syscall(sys_move_pages);
-cond_syscall(sys_chown16);
-cond_syscall(sys_fchown16);
-cond_syscall(sys_getegid16);
-cond_syscall(sys_geteuid16);
-cond_syscall(sys_getgid16);
-cond_syscall(sys_getgroups16);
-cond_syscall(sys_getresgid16);
-cond_syscall(sys_getresuid16);
-cond_syscall(sys_getuid16);
-cond_syscall(sys_lchown16);
-cond_syscall(sys_setfsgid16);
-cond_syscall(sys_setfsuid16);
-cond_syscall(sys_setgid16);
-cond_syscall(sys_setgroups16);
-cond_syscall(sys_setregid16);
-cond_syscall(sys_setresgid16);
-cond_syscall(sys_setresuid16);
-cond_syscall(sys_setreuid16);
-cond_syscall(sys_setuid16);
-cond_syscall(sys_sgetmask);
-cond_syscall(sys_ssetmask);
-cond_syscall(sys_vm86old);
-cond_syscall(sys_vm86);
-cond_syscall(sys_modify_ldt);
-cond_syscall(sys_ipc);
-cond_syscall(compat_sys_ipc);
-cond_syscall(compat_sys_sysctl);
-cond_syscall(sys_flock);
-cond_syscall(sys_io_setup);
-cond_syscall(sys_io_destroy);
-cond_syscall(sys_io_submit);
-cond_syscall(sys_io_cancel);
-cond_syscall(sys_io_getevents);
-cond_syscall(compat_sys_io_setup);
-cond_syscall(compat_sys_io_submit);
-cond_syscall(compat_sys_io_getevents);
-cond_syscall(sys_sysfs);
-cond_syscall(sys_syslog);
-cond_syscall(sys_process_vm_readv);
-cond_syscall(sys_process_vm_writev);
-cond_syscall(compat_sys_process_vm_readv);
-cond_syscall(compat_sys_process_vm_writev);
-cond_syscall(sys_uselib);
-cond_syscall(sys_fadvise64);
-cond_syscall(sys_fadvise64_64);
-cond_syscall(sys_madvise);
-cond_syscall(sys_setuid);
-cond_syscall(sys_setregid);
-cond_syscall(sys_setgid);
-cond_syscall(sys_setreuid);
-cond_syscall(sys_setresuid);
-cond_syscall(sys_getresuid);
-cond_syscall(sys_setresgid);
-cond_syscall(sys_getresgid);
-cond_syscall(sys_setgroups);
-cond_syscall(sys_getgroups);
-cond_syscall(sys_setfsuid);
-cond_syscall(sys_setfsgid);
-cond_syscall(sys_capget);
-cond_syscall(sys_capset);
-cond_syscall(sys_copy_file_range);
 
-/* arch-specific weak syscall entries */
-cond_syscall(sys_pciconfig_read);
-cond_syscall(sys_pciconfig_write);
-cond_syscall(sys_pciconfig_iobase);
-cond_syscall(compat_sys_s390_ipc);
-cond_syscall(ppc_rtas);
-cond_syscall(sys_spu_run);
-cond_syscall(sys_spu_create);
-cond_syscall(sys_subpage_prot);
-cond_syscall(sys_s390_pci_mmio_read);
-cond_syscall(sys_s390_pci_mmio_write);
+/* arch/example/kernel/sys_example.c */
 
-/* mmu depending weak syscall entries */
+/* mm/fadvise.c */
+cond_syscall(sys_fadvise64_64);
+
+/* mm/, CONFIG_MMU only */
+cond_syscall(sys_swapon);
+cond_syscall(sys_swapoff);
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);
 cond_syscall(sys_mlock);
 cond_syscall(sys_munlock);
 cond_syscall(sys_mlockall);
 cond_syscall(sys_munlockall);
-cond_syscall(sys_mlock2);
 cond_syscall(sys_mincore);
 cond_syscall(sys_madvise);
-cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
-cond_syscall(compat_sys_move_pages);
+cond_syscall(sys_mbind);
+cond_syscall(compat_sys_mbind);
+cond_syscall(sys_get_mempolicy);
+cond_syscall(compat_sys_get_mempolicy);
+cond_syscall(sys_set_mempolicy);
+cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_migrate_pages);
 cond_syscall(compat_sys_migrate_pages);
+cond_syscall(sys_move_pages);
+cond_syscall(compat_sys_move_pages);
 
-/* block-layer dependent */
-cond_syscall(sys_bdflush);
-cond_syscall(sys_ioprio_set);
-cond_syscall(sys_ioprio_get);
-
-/* New file descriptors */
-cond_syscall(sys_signalfd);
-cond_syscall(sys_signalfd4);
-cond_syscall(compat_sys_signalfd);
-cond_syscall(compat_sys_signalfd4);
-cond_syscall(sys_timerfd_create);
-cond_syscall(sys_timerfd_settime);
-cond_syscall(sys_timerfd_gettime);
-cond_syscall(compat_sys_timerfd_settime);
-cond_syscall(compat_sys_timerfd_gettime);
-cond_syscall(sys_eventfd);
-cond_syscall(sys_eventfd2);
-cond_syscall(sys_memfd_create);
-cond_syscall(sys_userfaultfd);
-
-/* performance counters: */
 cond_syscall(sys_perf_event_open);
+cond_syscall(sys_accept4);
+cond_syscall(sys_recvmmsg);
+cond_syscall(compat_sys_recvmmsg);
+
+/*
+ * Architecture specific syscalls: see further below
+ */
 
-/* fanotify! */
+/* fanotify */
 cond_syscall(sys_fanotify_init);
 cond_syscall(sys_fanotify_mark);
-cond_syscall(compat_sys_fanotify_mark);
 
 /* open by handle */
 cond_syscall(sys_name_to_handle_at);
 cond_syscall(sys_open_by_handle_at);
 cond_syscall(compat_sys_open_by_handle_at);
 
+cond_syscall(sys_sendmmsg);
+cond_syscall(compat_sys_sendmmsg);
+cond_syscall(sys_process_vm_readv);
+cond_syscall(compat_sys_process_vm_readv);
+cond_syscall(sys_process_vm_writev);
+cond_syscall(compat_sys_process_vm_writev);
+
 /* compare kernel pointers */
 cond_syscall(sys_kcmp);
 
+cond_syscall(sys_finit_module);
+
 /* operate on Secure Computing state */
 cond_syscall(sys_seccomp);
 
+cond_syscall(sys_memfd_create);
+
 /* access BPF programs and maps */
 cond_syscall(sys_bpf);
 
 /* execveat */
 cond_syscall(sys_execveat);
 
+cond_syscall(sys_userfaultfd);
+
 /* membarrier */
 cond_syscall(sys_membarrier);
 
+cond_syscall(sys_mlock2);
+
+cond_syscall(sys_copy_file_range);
+
 /* memory protection keys */
 cond_syscall(sys_pkey_mprotect);
 cond_syscall(sys_pkey_alloc);
 cond_syscall(sys_pkey_free);
+
+
+/*
+ * Architecture specific weak syscall entries.
+ */
+
+/* pciconfig: alpha, arm, arm64, ia64, sparc */
+cond_syscall(sys_pciconfig_read);
+cond_syscall(sys_pciconfig_write);
+cond_syscall(sys_pciconfig_iobase);
+
+/* sys_socketcall: arm, mips, x86, ... */
+cond_syscall(sys_socketcall);
+cond_syscall(compat_sys_socketcall);
+
+/* compat syscalls for arm64, x86, ... */
+cond_syscall(compat_sys_sysctl);
+cond_syscall(compat_sys_fanotify_mark);
+
+/* x86 */
+cond_syscall(sys_vm86old);
+cond_syscall(sys_modify_ldt);
+cond_syscall(compat_sys_quotactl32);
+cond_syscall(sys_vm86);
+cond_syscall(sys_kexec_file_load);
+
+/* s390 */
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
+cond_syscall(compat_sys_s390_ipc);
+
+/* powerpc */
+cond_syscall(ppc_rtas);
+cond_syscall(sys_spu_run);
+cond_syscall(sys_spu_create);
+cond_syscall(sys_subpage_prot);
+
+
+/*
+ * Deprecated system calls which are still defined in
+ * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
+ */
+
+/* __ARCH_WANT_SYSCALL_NO_FLAGS */
+cond_syscall(sys_epoll_create);
+cond_syscall(sys_inotify_init);
+cond_syscall(sys_eventfd);
+cond_syscall(sys_signalfd);
+cond_syscall(compat_sys_signalfd);
+
+/* __ARCH_WANT_SYSCALL_OFF_T */
+cond_syscall(sys_fadvise64);
+
+/* __ARCH_WANT_SYSCALL_DEPRECATED */
+cond_syscall(sys_epoll_wait);
+cond_syscall(sys_recv);
+cond_syscall(compat_sys_recv);
+cond_syscall(sys_send);
+cond_syscall(sys_bdflush);
+cond_syscall(sys_uselib);
+
+
+/*
+ * The syscalls below are not found in include/uapi/asm-generic/unistd.h
+ */
+
+/* obsolete: SGETMASK_SYSCALL */
+cond_syscall(sys_sgetmask);
+cond_syscall(sys_ssetmask);
+
+/* obsolete: SYSFS_SYSCALL */
+cond_syscall(sys_sysfs);
+
+/* obsolete: __ARCH_WANT_SYS_IPC */
+cond_syscall(sys_ipc);
+cond_syscall(compat_sys_ipc);
+
+/* obsolete: UID16 */
+cond_syscall(sys_chown16);
+cond_syscall(sys_fchown16);
+cond_syscall(sys_getegid16);
+cond_syscall(sys_geteuid16);
+cond_syscall(sys_getgid16);
+cond_syscall(sys_getgroups16);
+cond_syscall(sys_getresgid16);
+cond_syscall(sys_getresuid16);
+cond_syscall(sys_getuid16);
+cond_syscall(sys_lchown16);
+cond_syscall(sys_setfsgid16);
+cond_syscall(sys_setfsuid16);
+cond_syscall(sys_setgid16);
+cond_syscall(sys_setgroups16);
+cond_syscall(sys_setregid16);
+cond_syscall(sys_setresgid16);
+cond_syscall(sys_setresuid16);
+cond_syscall(sys_setreuid16);
+cond_syscall(sys_setuid16);
-- 
cgit v1.2.3-71-gd317


From 67a7acd3773a94df2e671601a288685485463cf9 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 4 Mar 2018 19:06:35 +0100
Subject: kernel/sys_ni: remove {sys_,sys_compat} from cond_syscall definitions

This keeps it in line with the SYSCALL_DEFINEx() / COMPAT_SYSCALL_DEFINEx()
calling convention.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 Documentation/process/adding-syscalls.rst |   2 +-
 kernel/sys_ni.c                           | 433 +++++++++++++++---------------
 2 files changed, 219 insertions(+), 216 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/process/adding-syscalls.rst b/Documentation/process/adding-syscalls.rst
index 556613744556..314c8bf6f2a2 100644
--- a/Documentation/process/adding-syscalls.rst
+++ b/Documentation/process/adding-syscalls.rst
@@ -222,7 +222,7 @@ your new syscall number may get adjusted to resolve conflicts.
 The file ``kernel/sys_ni.c`` provides a fallback stub implementation of each
 system call, returning ``-ENOSYS``.  Add your new system call here too::
 
-    cond_syscall(sys_xyzzy);
+    COND_SYSCALL(xyzzy);
 
 Your new kernel functionality, and the system call that controls it, should
 normally be optional, so add a ``CONFIG`` option (typically to
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0c1538f5a559..6cafc008f6db 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -17,53 +17,56 @@ asmlinkage long sys_ni_syscall(void)
 	return -ENOSYS;
 }
 
+#define COND_SYSCALL(name) cond_syscall(sys_##name)
+#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
+
 /*
  * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
  * Architecture specific entries go below, followed by deprecated or obsolete
  * system calls.
  */
 
-cond_syscall(sys_io_setup);
-cond_syscall(compat_sys_io_setup);
-cond_syscall(sys_io_destroy);
-cond_syscall(sys_io_submit);
-cond_syscall(compat_sys_io_submit);
-cond_syscall(sys_io_cancel);
-cond_syscall(sys_io_getevents);
-cond_syscall(compat_sys_io_getevents);
+COND_SYSCALL(io_setup);
+COND_SYSCALL_COMPAT(io_setup);
+COND_SYSCALL(io_destroy);
+COND_SYSCALL(io_submit);
+COND_SYSCALL_COMPAT(io_submit);
+COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents);
+COND_SYSCALL_COMPAT(io_getevents);
 
 /* fs/xattr.c */
 
 /* fs/dcache.c */
 
 /* fs/cookies.c */
-cond_syscall(sys_lookup_dcookie);
-cond_syscall(compat_sys_lookup_dcookie);
+COND_SYSCALL(lookup_dcookie);
+COND_SYSCALL_COMPAT(lookup_dcookie);
 
 /* fs/eventfd.c */
-cond_syscall(sys_eventfd2);
+COND_SYSCALL(eventfd2);
 
 /* fs/eventfd.c */
-cond_syscall(sys_epoll_create1);
-cond_syscall(sys_epoll_ctl);
-cond_syscall(sys_epoll_pwait);
-cond_syscall(compat_sys_epoll_pwait);
+COND_SYSCALL(epoll_create1);
+COND_SYSCALL(epoll_ctl);
+COND_SYSCALL(epoll_pwait);
+COND_SYSCALL_COMPAT(epoll_pwait);
 
 /* fs/fcntl.c */
 
 /* fs/inotify_user.c */
-cond_syscall(sys_inotify_init1);
-cond_syscall(sys_inotify_add_watch);
-cond_syscall(sys_inotify_rm_watch);
+COND_SYSCALL(inotify_init1);
+COND_SYSCALL(inotify_add_watch);
+COND_SYSCALL(inotify_rm_watch);
 
 /* fs/ioctl.c */
 
 /* fs/ioprio.c */
-cond_syscall(sys_ioprio_set);
-cond_syscall(sys_ioprio_get);
+COND_SYSCALL(ioprio_set);
+COND_SYSCALL(ioprio_get);
 
 /* fs/locks.c */
-cond_syscall(sys_flock);
+COND_SYSCALL(flock);
 
 /* fs/namei.c */
 
@@ -76,7 +79,7 @@ cond_syscall(sys_flock);
 /* fs/pipe.c */
 
 /* fs/quota.c */
-cond_syscall(sys_quotactl);
+COND_SYSCALL(quotactl);
 
 /* fs/readdir.c */
 
@@ -87,8 +90,8 @@ cond_syscall(sys_quotactl);
 /* fs/select.c */
 
 /* fs/signalfd.c */
-cond_syscall(sys_signalfd4);
-cond_syscall(compat_sys_signalfd4);
+COND_SYSCALL(signalfd4);
+COND_SYSCALL_COMPAT(signalfd4);
 
 /* fs/splice.c */
 
@@ -97,20 +100,20 @@ cond_syscall(compat_sys_signalfd4);
 /* fs/sync.c */
 
 /* fs/timerfd.c */
-cond_syscall(sys_timerfd_create);
-cond_syscall(sys_timerfd_settime);
-cond_syscall(compat_sys_timerfd_settime);
-cond_syscall(sys_timerfd_gettime);
-cond_syscall(compat_sys_timerfd_gettime);
+COND_SYSCALL(timerfd_create);
+COND_SYSCALL(timerfd_settime);
+COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_gettime);
+COND_SYSCALL_COMPAT(timerfd_gettime);
 
 /* fs/utimes.c */
 
 /* kernel/acct.c */
-cond_syscall(sys_acct);
+COND_SYSCALL(acct);
 
 /* kernel/capability.c */
-cond_syscall(sys_capget);
-cond_syscall(sys_capset);
+COND_SYSCALL(capget);
+COND_SYSCALL(capset);
 
 /* kernel/exec_domain.c */
 
@@ -119,29 +122,29 @@ cond_syscall(sys_capset);
 /* kernel/fork.c */
 
 /* kernel/futex.c */
-cond_syscall(sys_futex);
-cond_syscall(compat_sys_futex);
-cond_syscall(sys_set_robust_list);
-cond_syscall(compat_sys_set_robust_list);
-cond_syscall(sys_get_robust_list);
-cond_syscall(compat_sys_get_robust_list);
+COND_SYSCALL(futex);
+COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(set_robust_list);
+COND_SYSCALL_COMPAT(set_robust_list);
+COND_SYSCALL(get_robust_list);
+COND_SYSCALL_COMPAT(get_robust_list);
 
 /* kernel/hrtimer.c */
 
 /* kernel/itimer.c */
 
 /* kernel/kexec.c */
-cond_syscall(sys_kexec_load);
-cond_syscall(compat_sys_kexec_load);
+COND_SYSCALL(kexec_load);
+COND_SYSCALL_COMPAT(kexec_load);
 
 /* kernel/module.c */
-cond_syscall(sys_init_module);
-cond_syscall(sys_delete_module);
+COND_SYSCALL(init_module);
+COND_SYSCALL(delete_module);
 
 /* kernel/posix-timers.c */
 
 /* kernel/printk.c */
-cond_syscall(sys_syslog);
+COND_SYSCALL(syslog);
 
 /* kernel/ptrace.c */
 
@@ -150,176 +153,176 @@ cond_syscall(sys_syslog);
 /* kernel/signal.c */
 
 /* kernel/sys.c */
-cond_syscall(sys_setregid);
-cond_syscall(sys_setgid);
-cond_syscall(sys_setreuid);
-cond_syscall(sys_setuid);
-cond_syscall(sys_setresuid);
-cond_syscall(sys_getresuid);
-cond_syscall(sys_setresgid);
-cond_syscall(sys_getresgid);
-cond_syscall(sys_setfsuid);
-cond_syscall(sys_setfsgid);
-cond_syscall(sys_setgroups);
-cond_syscall(sys_getgroups);
+COND_SYSCALL(setregid);
+COND_SYSCALL(setgid);
+COND_SYSCALL(setreuid);
+COND_SYSCALL(setuid);
+COND_SYSCALL(setresuid);
+COND_SYSCALL(getresuid);
+COND_SYSCALL(setresgid);
+COND_SYSCALL(getresgid);
+COND_SYSCALL(setfsuid);
+COND_SYSCALL(setfsgid);
+COND_SYSCALL(setgroups);
+COND_SYSCALL(getgroups);
 
 /* kernel/time.c */
 
 /* kernel/timer.c */
 
 /* ipc/mqueue.c */
-cond_syscall(sys_mq_open);
-cond_syscall(compat_sys_mq_open);
-cond_syscall(sys_mq_unlink);
-cond_syscall(sys_mq_timedsend);
-cond_syscall(compat_sys_mq_timedsend);
-cond_syscall(sys_mq_timedreceive);
-cond_syscall(compat_sys_mq_timedreceive);
-cond_syscall(sys_mq_notify);
-cond_syscall(compat_sys_mq_notify);
-cond_syscall(sys_mq_getsetattr);
-cond_syscall(compat_sys_mq_getsetattr);
+COND_SYSCALL(mq_open);
+COND_SYSCALL_COMPAT(mq_open);
+COND_SYSCALL(mq_unlink);
+COND_SYSCALL(mq_timedsend);
+COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedreceive);
+COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_notify);
+COND_SYSCALL_COMPAT(mq_notify);
+COND_SYSCALL(mq_getsetattr);
+COND_SYSCALL_COMPAT(mq_getsetattr);
 
 /* ipc/msg.c */
-cond_syscall(sys_msgget);
-cond_syscall(sys_msgctl);
-cond_syscall(compat_sys_msgctl);
-cond_syscall(sys_msgrcv);
-cond_syscall(compat_sys_msgrcv);
-cond_syscall(sys_msgsnd);
-cond_syscall(compat_sys_msgsnd);
+COND_SYSCALL(msgget);
+COND_SYSCALL(msgctl);
+COND_SYSCALL_COMPAT(msgctl);
+COND_SYSCALL(msgrcv);
+COND_SYSCALL_COMPAT(msgrcv);
+COND_SYSCALL(msgsnd);
+COND_SYSCALL_COMPAT(msgsnd);
 
 /* ipc/sem.c */
-cond_syscall(sys_semget);
-cond_syscall(sys_semctl);
-cond_syscall(compat_sys_semctl);
-cond_syscall(sys_semtimedop);
-cond_syscall(compat_sys_semtimedop);
-cond_syscall(sys_semop);
+COND_SYSCALL(semget);
+COND_SYSCALL(semctl);
+COND_SYSCALL_COMPAT(semctl);
+COND_SYSCALL(semtimedop);
+COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semop);
 
 /* ipc/shm.c */
-cond_syscall(sys_shmget);
-cond_syscall(sys_shmctl);
-cond_syscall(compat_sys_shmctl);
-cond_syscall(sys_shmat);
-cond_syscall(compat_sys_shmat);
-cond_syscall(sys_shmdt);
+COND_SYSCALL(shmget);
+COND_SYSCALL(shmctl);
+COND_SYSCALL_COMPAT(shmctl);
+COND_SYSCALL(shmat);
+COND_SYSCALL_COMPAT(shmat);
+COND_SYSCALL(shmdt);
 
 /* net/socket.c */
-cond_syscall(sys_socket);
-cond_syscall(sys_socketpair);
-cond_syscall(sys_bind);
-cond_syscall(sys_listen);
-cond_syscall(sys_accept);
-cond_syscall(sys_connect);
-cond_syscall(sys_getsockname);
-cond_syscall(sys_getpeername);
-cond_syscall(sys_setsockopt);
-cond_syscall(compat_sys_setsockopt);
-cond_syscall(sys_getsockopt);
-cond_syscall(compat_sys_getsockopt);
-cond_syscall(sys_sendto);
-cond_syscall(sys_shutdown);
-cond_syscall(sys_recvfrom);
-cond_syscall(compat_sys_recvfrom);
-cond_syscall(sys_sendmsg);
-cond_syscall(compat_sys_sendmsg);
-cond_syscall(sys_recvmsg);
-cond_syscall(compat_sys_recvmsg);
+COND_SYSCALL(socket);
+COND_SYSCALL(socketpair);
+COND_SYSCALL(bind);
+COND_SYSCALL(listen);
+COND_SYSCALL(accept);
+COND_SYSCALL(connect);
+COND_SYSCALL(getsockname);
+COND_SYSCALL(getpeername);
+COND_SYSCALL(setsockopt);
+COND_SYSCALL_COMPAT(setsockopt);
+COND_SYSCALL(getsockopt);
+COND_SYSCALL_COMPAT(getsockopt);
+COND_SYSCALL(sendto);
+COND_SYSCALL(shutdown);
+COND_SYSCALL(recvfrom);
+COND_SYSCALL_COMPAT(recvfrom);
+COND_SYSCALL(sendmsg);
+COND_SYSCALL_COMPAT(sendmsg);
+COND_SYSCALL(recvmsg);
+COND_SYSCALL_COMPAT(recvmsg);
 
 /* mm/filemap.c */
 
 /* mm/nommu.c, also with MMU */
-cond_syscall(sys_mremap);
+COND_SYSCALL(mremap);
 
 /* security/keys/keyctl.c */
-cond_syscall(sys_add_key);
-cond_syscall(sys_request_key);
-cond_syscall(sys_keyctl);
-cond_syscall(compat_sys_keyctl);
+COND_SYSCALL(add_key);
+COND_SYSCALL(request_key);
+COND_SYSCALL(keyctl);
+COND_SYSCALL_COMPAT(keyctl);
 
 /* arch/example/kernel/sys_example.c */
 
 /* mm/fadvise.c */
-cond_syscall(sys_fadvise64_64);
+COND_SYSCALL(fadvise64_64);
 
 /* mm/, CONFIG_MMU only */
-cond_syscall(sys_swapon);
-cond_syscall(sys_swapoff);
-cond_syscall(sys_mprotect);
-cond_syscall(sys_msync);
-cond_syscall(sys_mlock);
-cond_syscall(sys_munlock);
-cond_syscall(sys_mlockall);
-cond_syscall(sys_munlockall);
-cond_syscall(sys_mincore);
-cond_syscall(sys_madvise);
-cond_syscall(sys_remap_file_pages);
-cond_syscall(sys_mbind);
-cond_syscall(compat_sys_mbind);
-cond_syscall(sys_get_mempolicy);
-cond_syscall(compat_sys_get_mempolicy);
-cond_syscall(sys_set_mempolicy);
-cond_syscall(compat_sys_set_mempolicy);
-cond_syscall(sys_migrate_pages);
-cond_syscall(compat_sys_migrate_pages);
-cond_syscall(sys_move_pages);
-cond_syscall(compat_sys_move_pages);
-
-cond_syscall(sys_perf_event_open);
-cond_syscall(sys_accept4);
-cond_syscall(sys_recvmmsg);
-cond_syscall(compat_sys_recvmmsg);
+COND_SYSCALL(swapon);
+COND_SYSCALL(swapoff);
+COND_SYSCALL(mprotect);
+COND_SYSCALL(msync);
+COND_SYSCALL(mlock);
+COND_SYSCALL(munlock);
+COND_SYSCALL(mlockall);
+COND_SYSCALL(munlockall);
+COND_SYSCALL(mincore);
+COND_SYSCALL(madvise);
+COND_SYSCALL(remap_file_pages);
+COND_SYSCALL(mbind);
+COND_SYSCALL_COMPAT(mbind);
+COND_SYSCALL(get_mempolicy);
+COND_SYSCALL_COMPAT(get_mempolicy);
+COND_SYSCALL(set_mempolicy);
+COND_SYSCALL_COMPAT(set_mempolicy);
+COND_SYSCALL(migrate_pages);
+COND_SYSCALL_COMPAT(migrate_pages);
+COND_SYSCALL(move_pages);
+COND_SYSCALL_COMPAT(move_pages);
+
+COND_SYSCALL(perf_event_open);
+COND_SYSCALL(accept4);
+COND_SYSCALL(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg);
 
 /*
  * Architecture specific syscalls: see further below
  */
 
 /* fanotify */
-cond_syscall(sys_fanotify_init);
-cond_syscall(sys_fanotify_mark);
+COND_SYSCALL(fanotify_init);
+COND_SYSCALL(fanotify_mark);
 
 /* open by handle */
-cond_syscall(sys_name_to_handle_at);
-cond_syscall(sys_open_by_handle_at);
-cond_syscall(compat_sys_open_by_handle_at);
+COND_SYSCALL(name_to_handle_at);
+COND_SYSCALL(open_by_handle_at);
+COND_SYSCALL_COMPAT(open_by_handle_at);
 
-cond_syscall(sys_sendmmsg);
-cond_syscall(compat_sys_sendmmsg);
-cond_syscall(sys_process_vm_readv);
-cond_syscall(compat_sys_process_vm_readv);
-cond_syscall(sys_process_vm_writev);
-cond_syscall(compat_sys_process_vm_writev);
+COND_SYSCALL(sendmmsg);
+COND_SYSCALL_COMPAT(sendmmsg);
+COND_SYSCALL(process_vm_readv);
+COND_SYSCALL_COMPAT(process_vm_readv);
+COND_SYSCALL(process_vm_writev);
+COND_SYSCALL_COMPAT(process_vm_writev);
 
 /* compare kernel pointers */
-cond_syscall(sys_kcmp);
+COND_SYSCALL(kcmp);
 
-cond_syscall(sys_finit_module);
+COND_SYSCALL(finit_module);
 
 /* operate on Secure Computing state */
-cond_syscall(sys_seccomp);
+COND_SYSCALL(seccomp);
 
-cond_syscall(sys_memfd_create);
+COND_SYSCALL(memfd_create);
 
 /* access BPF programs and maps */
-cond_syscall(sys_bpf);
+COND_SYSCALL(bpf);
 
 /* execveat */
-cond_syscall(sys_execveat);
+COND_SYSCALL(execveat);
 
-cond_syscall(sys_userfaultfd);
+COND_SYSCALL(userfaultfd);
 
 /* membarrier */
-cond_syscall(sys_membarrier);
+COND_SYSCALL(membarrier);
 
-cond_syscall(sys_mlock2);
+COND_SYSCALL(mlock2);
 
-cond_syscall(sys_copy_file_range);
+COND_SYSCALL(copy_file_range);
 
 /* memory protection keys */
-cond_syscall(sys_pkey_mprotect);
-cond_syscall(sys_pkey_alloc);
-cond_syscall(sys_pkey_free);
+COND_SYSCALL(pkey_mprotect);
+COND_SYSCALL(pkey_alloc);
+COND_SYSCALL(pkey_free);
 
 
 /*
@@ -327,35 +330,35 @@ cond_syscall(sys_pkey_free);
  */
 
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
-cond_syscall(sys_pciconfig_read);
-cond_syscall(sys_pciconfig_write);
-cond_syscall(sys_pciconfig_iobase);
+COND_SYSCALL(pciconfig_read);
+COND_SYSCALL(pciconfig_write);
+COND_SYSCALL(pciconfig_iobase);
 
 /* sys_socketcall: arm, mips, x86, ... */
-cond_syscall(sys_socketcall);
-cond_syscall(compat_sys_socketcall);
+COND_SYSCALL(socketcall);
+COND_SYSCALL_COMPAT(socketcall);
 
 /* compat syscalls for arm64, x86, ... */
-cond_syscall(compat_sys_sysctl);
-cond_syscall(compat_sys_fanotify_mark);
+COND_SYSCALL_COMPAT(sysctl);
+COND_SYSCALL_COMPAT(fanotify_mark);
 
 /* x86 */
-cond_syscall(sys_vm86old);
-cond_syscall(sys_modify_ldt);
-cond_syscall(compat_sys_quotactl32);
-cond_syscall(sys_vm86);
-cond_syscall(sys_kexec_file_load);
+COND_SYSCALL(vm86old);
+COND_SYSCALL(modify_ldt);
+COND_SYSCALL_COMPAT(quotactl32);
+COND_SYSCALL(vm86);
+COND_SYSCALL(kexec_file_load);
 
 /* s390 */
-cond_syscall(sys_s390_pci_mmio_read);
-cond_syscall(sys_s390_pci_mmio_write);
-cond_syscall(compat_sys_s390_ipc);
+COND_SYSCALL(s390_pci_mmio_read);
+COND_SYSCALL(s390_pci_mmio_write);
+COND_SYSCALL_COMPAT(s390_ipc);
 
 /* powerpc */
 cond_syscall(ppc_rtas);
-cond_syscall(sys_spu_run);
-cond_syscall(sys_spu_create);
-cond_syscall(sys_subpage_prot);
+COND_SYSCALL(spu_run);
+COND_SYSCALL(spu_create);
+COND_SYSCALL(subpage_prot);
 
 
 /*
@@ -364,22 +367,22 @@ cond_syscall(sys_subpage_prot);
  */
 
 /* __ARCH_WANT_SYSCALL_NO_FLAGS */
-cond_syscall(sys_epoll_create);
-cond_syscall(sys_inotify_init);
-cond_syscall(sys_eventfd);
-cond_syscall(sys_signalfd);
-cond_syscall(compat_sys_signalfd);
+COND_SYSCALL(epoll_create);
+COND_SYSCALL(inotify_init);
+COND_SYSCALL(eventfd);
+COND_SYSCALL(signalfd);
+COND_SYSCALL_COMPAT(signalfd);
 
 /* __ARCH_WANT_SYSCALL_OFF_T */
-cond_syscall(sys_fadvise64);
+COND_SYSCALL(fadvise64);
 
 /* __ARCH_WANT_SYSCALL_DEPRECATED */
-cond_syscall(sys_epoll_wait);
-cond_syscall(sys_recv);
-cond_syscall(compat_sys_recv);
-cond_syscall(sys_send);
-cond_syscall(sys_bdflush);
-cond_syscall(sys_uselib);
+COND_SYSCALL(epoll_wait);
+COND_SYSCALL(recv);
+COND_SYSCALL_COMPAT(recv);
+COND_SYSCALL(send);
+COND_SYSCALL(bdflush);
+COND_SYSCALL(uselib);
 
 
 /*
@@ -387,33 +390,33 @@ cond_syscall(sys_uselib);
  */
 
 /* obsolete: SGETMASK_SYSCALL */
-cond_syscall(sys_sgetmask);
-cond_syscall(sys_ssetmask);
+COND_SYSCALL(sgetmask);
+COND_SYSCALL(ssetmask);
 
 /* obsolete: SYSFS_SYSCALL */
-cond_syscall(sys_sysfs);
+COND_SYSCALL(sysfs);
 
 /* obsolete: __ARCH_WANT_SYS_IPC */
-cond_syscall(sys_ipc);
-cond_syscall(compat_sys_ipc);
+COND_SYSCALL(ipc);
+COND_SYSCALL_COMPAT(ipc);
 
 /* obsolete: UID16 */
-cond_syscall(sys_chown16);
-cond_syscall(sys_fchown16);
-cond_syscall(sys_getegid16);
-cond_syscall(sys_geteuid16);
-cond_syscall(sys_getgid16);
-cond_syscall(sys_getgroups16);
-cond_syscall(sys_getresgid16);
-cond_syscall(sys_getresuid16);
-cond_syscall(sys_getuid16);
-cond_syscall(sys_lchown16);
-cond_syscall(sys_setfsgid16);
-cond_syscall(sys_setfsuid16);
-cond_syscall(sys_setgid16);
-cond_syscall(sys_setgroups16);
-cond_syscall(sys_setregid16);
-cond_syscall(sys_setresgid16);
-cond_syscall(sys_setresuid16);
-cond_syscall(sys_setreuid16);
-cond_syscall(sys_setuid16);
+COND_SYSCALL(chown16);
+COND_SYSCALL(fchown16);
+COND_SYSCALL(getegid16);
+COND_SYSCALL(geteuid16);
+COND_SYSCALL(getgid16);
+COND_SYSCALL(getgroups16);
+COND_SYSCALL(getresgid16);
+COND_SYSCALL(getresuid16);
+COND_SYSCALL(getuid16);
+COND_SYSCALL(lchown16);
+COND_SYSCALL(setfsgid16);
+COND_SYSCALL(setfsuid16);
+COND_SYSCALL(setgid16);
+COND_SYSCALL(setgroups16);
+COND_SYSCALL(setregid16);
+COND_SYSCALL(setresgid16);
+COND_SYSCALL(setresuid16);
+COND_SYSCALL(setreuid16);
+COND_SYSCALL(setuid16);
-- 
cgit v1.2.3-71-gd317


From 7303e30ec1d8fb5ca1f07c92d069241c32b2ee1b Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Thu, 5 Apr 2018 11:53:03 +0200
Subject: syscalls/core: Prepare CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y for compat
 syscalls

It may be useful for an architecture to override the definitions of the
COMPAT_SYSCALL_DEFINE0() and __COMPAT_SYSCALL_DEFINEx() macros in
<linux/compat.h>, in particular to use a different calling convention
for syscalls. This patch provides a mechanism to do so, based on the
previously introduced CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled,
<asm/sycall_wrapper.h> is included in <linux/compat.h> and may be used
to define the macros mentioned above. Moreover, as the syscall calling
convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set,
the compat syscall function prototypes in <linux/compat.h> are #ifndef'd
out in that case.

As some of the syscalls and/or compat syscalls may not be present,
the COND_SYSCALL() and COND_SYSCALL_COMPAT() macros in kernel/sys_ni.c
as well as the SYS_NI() and COMPAT_SYS_NI() macros in
kernel/time/posix-stubs.c can be re-defined in <asm/syscall_wrapper.h> iff
CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180405095307.3730-5-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compat.h    | 22 ++++++++++++++++++++++
 init/Kconfig              |  9 ++++++---
 kernel/sys_ni.c           | 10 ++++++++++
 kernel/time/posix-stubs.c | 10 ++++++++++
 4 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 9847c5a013c3..2d85ec5cfda2 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -24,6 +24,17 @@
 #include <asm/siginfo.h>
 #include <asm/signal.h>
 
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/*
+ * It may be useful for an architecture to override the definitions of the
+ * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular
+ * to use a different calling convention for syscalls. To allow for that,
+ + the prototypes for the compat_sys_*() functions below will *not* be included
+ * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
+ */
+#include <asm/syscall_wrapper.h>
+#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
+
 #ifndef COMPAT_USE_64BIT_TIME
 #define COMPAT_USE_64BIT_TIME 0
 #endif
@@ -32,10 +43,12 @@
 #define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v))
 #endif
 
+#ifndef COMPAT_SYSCALL_DEFINE0
 #define COMPAT_SYSCALL_DEFINE0(name) \
 	asmlinkage long compat_sys_##name(void); \
 	ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \
 	asmlinkage long compat_sys_##name(void)
+#endif /* COMPAT_SYSCALL_DEFINE0 */
 
 #define COMPAT_SYSCALL_DEFINE1(name, ...) \
         COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
@@ -50,6 +63,7 @@
 #define COMPAT_SYSCALL_DEFINE6(name, ...) \
 	COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
 
+#ifndef COMPAT_SYSCALL_DEFINEx
 #define COMPAT_SYSCALL_DEFINEx(x, name, ...)				\
 	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
 	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\
@@ -62,6 +76,7 @@
 		return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));	\
 	}								\
 	static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+#endif /* COMPAT_SYSCALL_DEFINEx */
 
 #ifndef compat_user_stack_pointer
 #define compat_user_stack_pointer() current_user_stack_pointer()
@@ -517,7 +532,12 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
  * Please note that these prototypes here are only provided for information
  * purposes, for static analysis, and for linking from the syscall table.
  * These functions should not be called elsewhere from kernel code.
+ *
+ * As the syscall calling convention may be different from the default
+ * for architectures overriding the syscall calling convention, do not
+ * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
  */
+#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
 asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p);
 asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr,
 				     u32 __user *iocb);
@@ -955,6 +975,8 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr);
 /* obsolete: net/socket.c */
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args);
 
+#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
+
 
 /*
  * For most but not all architectures, "am I in a compat syscall?" and
diff --git a/init/Kconfig b/init/Kconfig
index 068eb6c3bbf7..2dbc88051bde 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1925,8 +1925,11 @@ config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	bool
 
 # It may be useful for an architecture to override the definitions of the
-# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>,
-# in particular to use a different calling convention for syscalls.
+# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>
+# and the COMPAT_ variants in <linux/compat.h>, in particular to use a
+# different calling convention for syscalls. They can also override the
+# macros for not-implemented syscalls in kernel/sys_ni.c and
+# kernel/time/posix-stubs.c. All these overrides need to be available in
+# <asm/syscall_wrapper.h>.
 config ARCH_HAS_SYSCALL_WRAPPER
 	def_bool n
-	depends on !COMPAT
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6cafc008f6db..9791364925dc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -5,6 +5,11 @@
 
 #include <asm/unistd.h>
 
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */
+#include <asm/syscall_wrapper.h>
+#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
+
 /*  we can't #include <linux/syscalls.h> here,
     but tell gcc to not warn with -Wmissing-prototypes  */
 asmlinkage long sys_ni_syscall(void);
@@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void)
 	return -ENOSYS;
 }
 
+#ifndef COND_SYSCALL
 #define COND_SYSCALL(name) cond_syscall(sys_##name)
+#endif /* COND_SYSCALL */
+
+#ifndef COND_SYSCALL_COMPAT
 #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
+#endif /* COND_SYSCALL_COMPAT */
 
 /*
  * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index b258bee13b02..69a937c3cd81 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -19,6 +19,11 @@
 #include <linux/posix-timers.h>
 #include <linux/compat.h>
 
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override SYS_NI and COMPAT_SYS_NI */
+#include <asm/syscall_wrapper.h>
+#endif
+
 asmlinkage long sys_ni_posix_timers(void)
 {
 	pr_err_once("process %d (%s) attempted a POSIX timer syscall "
@@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void)
 	return -ENOSYS;
 }
 
+#ifndef SYS_NI
 #define SYS_NI(name)  SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
+#endif
+
+#ifndef COMPAT_SYS_NI
 #define COMPAT_SYS_NI(name)  SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
+#endif
 
 SYS_NI(timer_create);
 SYS_NI(timer_gettime);
-- 
cgit v1.2.3-71-gd317