From 6fe0ce1eb04f99a1eb1eb6e7f775666966cf6c80 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 6 Feb 2018 09:55:48 +0800 Subject: sched/deadline: Make update_curr_dl() more accurate rq->clock_task may be updated between the two calls of rq_clock_task() in update_curr_dl(). Calling rq_clock_task() only once makes it more accurate and efficient, taking update_curr() as reference. Suggested-by: Peter Zijlstra Signed-off-by: Wen Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1517882148-44599-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9bb0e0c412ec..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq) struct sched_dl_entity *dl_se = &curr->dl; u64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); + u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); -- cgit v1.2.3-71-gd317 From a7711602c7b79950ea437178f601b52ab08ef659 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 6 Feb 2018 09:53:28 +0800 Subject: sched/rt: Make update_curr_rt() more accurate rq->clock_task may be updated between the two calls of rq_clock_task() in update_curr_rt(). Calling rq_clock_task() only once makes it more accurate and efficient, taking update_curr() as reference. Signed-off-by: Wen Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1517882008-44552-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 663b2355a3aa..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 now = rq_clock_task(rq); u64 delta_exec; + u64 now; if (curr->sched_class != &rt_sched_class) return; + now = rq_clock_task(rq); delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; -- cgit v1.2.3-71-gd317 From 269d599271fa604f09d5cb0093c5dd5d59964dd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Feb 2018 17:52:13 +0100 Subject: sched/core: Fix DEBUG_SPINLOCK annotation for rq->lock Mark noticed that he had sporadic "spinlock recursion" warnings from the DEBUG_SPINLOCK code. Now rq->lock is special in that the owner changes in the middle of a context switch. It so happens that we fix up the lock.owner too late, @prev can run (remotely) the moment prev->on_cpu is cleared, this then allows @prev to again try and acquire this rq->lock and trigger this warning. So we have to switch lock.owner before clearing prev->on_cpu. Do this by moving the DEBUG_SPINLOCK annotation from after switch_to() to before switch_to() and collect all lockdep annotations there into prepare_lock_switch() to mirror the existing finish_lock_switch(). Debugged-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf724c1952ea..e7c535eee0a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev) #endif } -static inline void finish_lock_switch(struct rq *rq) +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + rq_unpin_lock(rq, rf); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->lock.owner = next; #endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); } @@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); -- cgit v1.2.3-71-gd317 From 43d1b29b27c76e7454cd6c85bec4d0e9cbb039f3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 8 Feb 2018 21:48:22 +0800 Subject: sched/cpufreq: Remove unused SUGOV_KTHREAD_PRIORITY macro Since schedutil kernel thread directly set priority to 0, the macro SUGOV_KTHREAD_PRIORITY is not used. So remove it. Signed-off-by: Leo Yan Acked-by: Viresh Kumar Acked-by: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Vikram Mulukutla Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1518097702-9665-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dd062a1c8cf0..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@ #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY 50 - struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; -- cgit v1.2.3-71-gd317 From 95bcade33a8af38755c9b0636e36a36ad3789fe6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:22:56 +0000 Subject: locking/qspinlock: Ensure node is initialised before updating prev->next When a locker ends up queuing on the qspinlock locking slowpath, we initialise the relevant mcs node and publish it indirectly by updating the tail portion of the lock word using xchg_tail. If we find that there was a pre-existing locker in the queue, we subsequently update their ->next field to point at our node so that we are notified when it's our turn to take the lock. This can be roughly illustrated as follows: /* Initialise the fields in node and encode a pointer to node in tail */ tail = initialise_node(node); /* * Exchange tail into the lockword using an atomic read-modify-write * operation with release semantics */ old = xchg_tail(lock, tail); /* If there was a pre-existing waiter ... */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); smp_read_barrier_depends(); /* ... then update their ->next field to point to node. WRITE_ONCE(prev->next, node); } The conditional update of prev->next therefore relies on the address dependency from the result of xchg_tail ensuring order against the prior initialisation of node. However, since the release semantics of the xchg_tail operation apply only to the write portion of the RmW, then this ordering is not guaranteed and it is possible for the CPU to return old before the writes to node have been published, consequently allowing us to point prev->next to an uninitialised node. This patch fixes the problem by making the update of prev->next a RELEASE operation, which also removes the reliance on dependency ordering. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518528177-19169-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38ece035039e..348c8cec1042 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -408,14 +408,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* - * The above xchg_tail() is also a load of @lock which - * generates, through decode_tail(), a pointer. The address - * dependency matches the RELEASE of xchg_tail() such that - * the subsequent access to @prev happens after. + * We must ensure that the stores to @node are observed before + * the write to prev->next. The address dependency from + * xchg_tail is not sufficient to ensure this because the read + * component of xchg_tail is unordered with respect to the + * initialisation of @node. */ - - WRITE_ONCE(prev->next, node); + smp_store_release(&prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); -- cgit v1.2.3-71-gd317 From 11dc13224c975efcec96647a4768a6f1bb7a19a8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:22:57 +0000 Subject: locking/qspinlock: Ensure node->count is updated before initialising node When queuing on the qspinlock, the count field for the current CPU's head node is incremented. This needn't be atomic because locking in e.g. IRQ context is balanced and so an IRQ will return with node->count as it found it. However, the compiler could in theory reorder the initialisation of node[idx] before the increment of the head node->count, causing an IRQ to overwrite the initialised node and potentially corrupt the lock state. Avoid the potential for this harmful compiler reordering by placing a barrier() between the increment of the head node->count and the subsequent node initialisation. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518528177-19169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 348c8cec1042..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,6 +379,14 @@ queue: tail = encode_tail(smp_processor_id(), idx); node += idx; + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); -- cgit v1.2.3-71-gd317 From 9a3efb6b661f71d5675369ace9257833f0e78ef3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Feb 2018 19:00:21 -0800 Subject: bpf: fix memory leak in lpm_trie map_free callback function There is a memory leak happening in lpm_trie map_free callback function trie_free. The trie structure itself does not get freed. Also, trie_free function did not do synchronize_rcu before freeing various data structures. This is incorrect as some rcu_read_lock region(s) for lookup, update, delete or get_next_key may not complete yet. The fix is to add synchronize_rcu in the beginning of trie_free. The useless spin_lock is removed from this function as well. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Mathieu Malaterre Reported-by: Alexei Starovoitov Tested-by: Mathieu Malaterre Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 7b469d10d0e9..a75e02c961b5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -555,7 +555,10 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - raw_spin_lock(&trie->lock); + /* Wait for outstanding programs to complete + * update/lookup/delete/get_next_key and free the trie. + */ + synchronize_rcu(); /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent @@ -569,7 +572,7 @@ static void trie_free(struct bpf_map *map) node = rcu_dereference_protected(*slot, lockdep_is_held(&trie->lock)); if (!node) - goto unlock; + goto out; if (rcu_access_pointer(node->child[0])) { slot = &node->child[0]; @@ -587,8 +590,8 @@ static void trie_free(struct bpf_map *map) } } -unlock: - raw_spin_unlock(&trie->lock); +out: + kfree(trie); } static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) -- cgit v1.2.3-71-gd317 From 952fad8e323975c4e826b659087d2648777594a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2018 15:33:52 -0800 Subject: bpf: fix sock_map_alloc() error path In case user program provides silly parameters, we want a map_alloc() handler to return an error, not a NULL pointer, otherwise we crash later in find_and_alloc_map() Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 48c33417d13c..a927e89dad6e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -521,8 +521,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -547,6 +547,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); + err = -EINVAL; if (cost >= U32_MAX - PAGE_SIZE) goto free_stab; -- cgit v1.2.3-71-gd317 From 7fc17e909edfb9bf421ee04e981d3d474175c7c7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 14 Feb 2018 22:17:34 +0800 Subject: bpf: cpumap: use GFP_KERNEL instead of GFP_ATOMIC in __cpu_map_entry_alloc() There're several implications after commit 0bf7800f1799 ("ptr_ring: try vmalloc() when kmalloc() fails") with the using of vmalloc() since can't allow GFP_ATOMIC but mandate GFP_KERNEL. This will lead a WARN since cpumap try to call with GFP_ATOMIC. Fortunately, entry allocation of cpumap can only be done through syscall path which means GFP_ATOMIC is not necessary, so fixing this by replacing GFP_ATOMIC with GFP_KERNEL. Reported-by: syzbot+1a240cdb1f4cc88819df@syzkaller.appspotmail.com Fixes: 0bf7800f1799 ("ptr_ring: try vmalloc() when kmalloc() fails") Cc: Michal Hocko Cc: Daniel Borkmann Cc: Matthew Wilcox Cc: Jesper Dangaard Brouer Cc: akpm@linux-foundation.org Cc: dhowells@redhat.com Cc: hannes@cmpxchg.org Signed-off-by: Jason Wang Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbfdada6caee..a4bb0b34375a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -334,7 +334,7 @@ static int cpu_map_kthread_run(void *data) static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) { - gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; int numa, err; -- cgit v1.2.3-71-gd317 From 9c481b908b011398b1491752271cd1e2c9ad5758 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 14 Feb 2018 15:31:00 +0100 Subject: bpf: fix bpf_prog_array_copy_to_user warning from perf event prog query syzkaller tried to perform a prog query in perf_event_query_prog_array() where struct perf_event_query_bpf had an ids_len of 1,073,741,353 and thus causing a warning due to failed kcalloc() allocation out of the bpf_prog_array_copy_to_user() helper. Given we cannot attach more than 64 programs to a perf event, there's no point in allowing huge ids_len. Therefore, allow a buffer that would fix the maximum number of ids and also add a __GFP_NOWARN to the temporary ids buffer. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Fixes: 0911287ce32b ("bpf: fix bpf_prog_array_copy_to_user() issues") Reported-by: syzbot+cab5816b0edbabf598b3@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29ca9208dcfa..d315b393abdd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1590,7 +1590,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, * so always copy 'cnt' prog_ids to the user. * In a rare race the user will see zero prog_ids */ - ids = kcalloc(cnt, sizeof(u32), GFP_USER); + ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; rcu_read_lock(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fc2838ac8b78..c0a9e310d715 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -872,6 +872,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; + if (query.ids_len > BPF_TRACE_MAX_PROGS) + return -E2BIG; mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, -- cgit v1.2.3-71-gd317 From 9c2d63b843a5c8a8d0559cc067b5398aa5ec3ffc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 16 Feb 2018 01:10:29 +0100 Subject: bpf: fix mlock precharge on arraymaps syzkaller recently triggered OOM during percpu map allocation; while there is work in progress by Dennis Zhou to add __GFP_NORETRY semantics for percpu allocator under pressure, there seems also a missing bpf_map_precharge_memlock() check in array map allocation. Given today the actual bpf_map_charge_memlock() happens after the find_and_alloc_map() in syscall path, the bpf_map_precharge_memlock() is there to bail out early before we go and do the map setup work when we find that we hit the limits anyway. Therefore add this for array map as well. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map") Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Dennis Zhou Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b1f66480135b..a364c408f25a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -73,11 +73,11 @@ static int array_map_alloc_check(union bpf_attr *attr) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; - int numa_node = bpf_map_attr_numa_node(attr); + int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); + u64 cost, array_size, mask64; struct bpf_array *array; - u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -109,8 +109,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ - if (array_size >= U32_MAX - PAGE_SIZE) + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); + if (percpu) { + cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + } + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); @@ -121,20 +132,13 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; array->elem_size = elem_size; - if (!percpu) - goto out; - - array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); - - if (array_size >= U32_MAX - PAGE_SIZE || - bpf_array_alloc_percpu(array)) { + if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } -out: - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; return &array->map; } -- cgit v1.2.3-71-gd317 From 12310e3437554328bcd75186cf331bc712cb30b2 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 10 Jan 2018 00:51:23 +0100 Subject: kprobes: Propagate error from arm_kprobe_ftrace() Improve error handling when arming ftrace-based kprobes. Specifically, if we fail to arm a ftrace-based kprobe, register_kprobe()/enable_kprobe() should report an error instead of success. Previously, this has lead to confusing situations where register_kprobe() would return 0 indicating success, but the kprobe would not be functional if ftrace registration during the kprobe arming process had failed. We should therefore take any errors returned by ftrace into account and propagate this error so that we do not register/enable kprobes that cannot be armed. This can happen if, for example, register_ftrace_function() finds an IPMODIFY conflict (since kprobe_ftrace_ops has this flag set) and returns an error. Such a conflict is possible since livepatches also set the IPMODIFY flag for their ftrace_ops. arm_all_kprobes() keeps its current behavior and attempts to arm all kprobes. It returns the last encountered error and gives a warning if not all probes could be armed. This patch is based on Petr Mladek's original patchset (patches 2 and 3) back in 2015, which improved kprobes error handling, found here: https://lkml.org/lkml/2015/2/26/452 However, further work on this had been paused since then and the patches were not upstreamed. Based-on-patches-by: Petr Mladek Signed-off-by: Jessica Yu Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Jiri Kosina Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20180109235124.30886-2-jeyu@kernel.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 100 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..2d988141ab85 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,18 +978,36 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); - WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); - kprobe_ftrace_enabled++; - if (kprobe_ftrace_enabled == 1) { + if (ret) { + pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; + } + + if (kprobe_ftrace_enabled == 0) { ret = register_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (ret) { + pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + goto err_ftrace; + } } + + kprobe_ftrace_enabled++; + return ret; + +err_ftrace: + /* + * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a + * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental + * empty filter_hash which would undesirably trace all functions. + */ + ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + return ret; } /* Caller must lock kprobe_mutex */ @@ -1008,22 +1026,23 @@ static void disarm_kprobe_ftrace(struct kprobe *p) } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) do {} while (0) +#define arm_kprobe_ftrace(p) (-ENODEV) #define disarm_kprobe_ftrace(p) do {} while (0) #endif /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp) { - if (unlikely(kprobe_ftrace(kp))) { - arm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return arm_kprobe_ftrace(kp); + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* Disarm a kprobe with text_mutex */ @@ -1362,9 +1381,15 @@ out: if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ - arm_kprobe(ap); + ret = arm_kprobe(ap); + if (ret) { + ap->flags |= KPROBE_FLAG_DISABLED; + list_del_rcu(&p->list); + synchronize_sched(); + } + } } return ret; } @@ -1573,8 +1598,14 @@ int register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arm_kprobe(p); + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_sched(); + goto out; + } + } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); @@ -2116,7 +2147,9 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); + ret = arm_kprobe(p); + if (ret) + p->flags |= KPROBE_FLAG_DISABLED; } out: mutex_unlock(&kprobe_mutex); @@ -2407,11 +2440,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); @@ -2428,16 +2462,28 @@ static void arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) - if (!kprobe_disabled(p)) - arm_kprobe(p); + /* Arm all kprobes on a best-effort basis */ + hlist_for_each_entry_rcu(p, head, hlist) { + if (!kprobe_disabled(p)) { + err = arm_kprobe(p); + if (err) { + errors++; + ret = err; + } + total++; + } + } } - printk(KERN_INFO "Kprobes globally enabled\n"); + if (errors) + pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); - return; + return ret; } static void disarm_all_kprobes(void) @@ -2494,6 +2540,7 @@ static ssize_t write_enabled_file_bool(struct file *file, { char buf[32]; size_t buf_size; + int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,7 +2551,7 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - arm_all_kprobes(); + ret = arm_all_kprobes(); break; case 'n': case 'N': @@ -2515,6 +2562,9 @@ static ssize_t write_enabled_file_bool(struct file *file, return -EINVAL; } + if (ret) + return ret; + return count; } -- cgit v1.2.3-71-gd317 From 297f9233b53a08fd457815e19f1d6f2c3389857b Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 10 Jan 2018 00:51:24 +0100 Subject: kprobes: Propagate error from disarm_kprobe_ftrace() Improve error handling when disarming ftrace-based kprobes. Like with arm_kprobe_ftrace(), propagate any errors from disarm_kprobe_ftrace() so that we do not disable/unregister kprobes that are still armed. In other words, unregister_kprobe() and disable_kprobe() should not report success if the kprobe could not be disarmed. disarm_all_kprobes() keeps its current behavior and attempts to disarm all kprobes. It returns the last encountered error and gives a warning if not all probes could be disarmed. This patch is based on Petr Mladek's original patchset (patches 2 and 3) back in 2015, which improved kprobes error handling, found here: https://lkml.org/lkml/2015/2/26/452 However, further work on this had been paused since then and the patches were not upstreamed. Based-on-patches-by: Petr Mladek Signed-off-by: Jessica Yu Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Jiri Kosina Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20180109235124.30886-3-jeyu@kernel.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 78 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d988141ab85..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1011,23 +1011,27 @@ err_ftrace: } /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; - kprobe_ftrace_enabled--; - if (kprobe_ftrace_enabled == 0) { + if (kprobe_ftrace_enabled == 1) { ret = unregister_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + return ret; } + + kprobe_ftrace_enabled--; + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) (-ENODEV) #endif /* Arm a kprobe with text_mutex */ @@ -1046,18 +1050,18 @@ static int arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt) { - if (unlikely(kprobe_ftrace(kp))) { - disarm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* @@ -1639,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap) static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; + int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) - return NULL; + return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ @@ -1657,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) * should have already been disarmed, so * skip unneed disarming process. */ - if (!kprobes_all_disarmed) - disarm_kprobe(orig_p, true); + if (!kprobes_all_disarmed) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); + } + } orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1675,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p) /* Disable kprobe. This will disarm it if needed. */ ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; + if (IS_ERR(ap)) + return PTR_ERR(ap); if (ap == p) /* @@ -2109,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p) int disable_kprobe(struct kprobe *kp) { int ret = 0; + struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; + p = __disable_kprobe(kp); + if (IS_ERR(p)) + ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; @@ -2486,34 +2498,50 @@ already_enabled: return ret; } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); - return; + return 0; } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; + /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry_rcu(p, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - disarm_kprobe(p, false); + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { + err = disarm_kprobe(p, false); + if (err) { + errors++; + ret = err; + } + total++; + } } } + + if (errors) + pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally disabled\n"); + mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); + + return ret; } /* @@ -2556,7 +2584,7 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'n': case 'N': case '0': - disarm_all_kprobes(); + ret = disarm_all_kprobes(); break; default: return -EINVAL; -- cgit v1.2.3-71-gd317 From 0b24a0bbe2147815d982d9335c41bb10c04f40bc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Feb 2018 17:47:35 +0200 Subject: irqdomain: Re-use DEFINE_SHOW_ATTRIBUTE() macro ...instead of open coding file operations followed by custom ->open() callbacks per each attribute. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e6a9c36470ee..82b8b18ee1eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1726,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p) irq_domain_debug_show_one(m, d, 0); return 0; } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { - .open = irq_domain_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir || d->debugfs_file) return; d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &dfs_domain_ops); + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1760,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root) if (!domain_dir) return; - debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + debugfs_create_file("default", 0444, domain_dir, NULL, + &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); -- cgit v1.2.3-71-gd317 From 27d4ee03078aba88c5e07dcc4917e8d01d046f38 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 11 Feb 2018 10:38:28 +0100 Subject: workqueue: Allow retrieval of current task's work struct Introduce a helper to retrieve the current task's work struct if it is a workqueue worker. This allows us to fix a long-standing deadlock in several DRM drivers wherein the ->runtime_suspend callback waits for a specific worker to finish and that worker in turn calls a function which waits for runtime suspend to finish. That function is invoked from multiple call sites and waiting for runtime suspend to finish is the correct thing to do except if it's executing in the context of the worker. Cc: Lai Jiangshan Cc: Dave Airlie Cc: Ben Skeggs Cc: Alex Deucher Acked-by: Tejun Heo Reviewed-by: Lyude Paul Signed-off-by: Lukas Wunner Link: https://patchwork.freedesktop.org/patch/msgid/2d8f603074131eb87e588d2b803a71765bd3a2fd.1518338788.git.lukas@wunner.de --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4a54ef96aff5..bc0cda180c8b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -465,6 +465,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..255c20efdf7b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4167,6 +4167,22 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * current_work - retrieve %current task's work struct + * + * Determine if %current task is a workqueue worker and what it's working on. + * Useful to find out the context that the %current task is running in. + * + * Return: work struct if %current task is a workqueue worker, %NULL otherwise. + */ +struct work_struct *current_work(void) +{ + struct worker *worker = current_wq_worker(); + + return worker ? worker->current_work : NULL; +} +EXPORT_SYMBOL(current_work); + /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * -- cgit v1.2.3-71-gd317 From d34bc48f8275b6ce0da44f639d68344891268ee9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Feb 2018 14:45:17 -0800 Subject: include/linux/sched/mm.h: re-inline mmdrop() As Peter points out, Doing a CALL+RET for just the decrement is a bit silly. Fixes: d70f2a14b72a4bc ("include/linux/sched/mm.h: uninline mmdrop_async(), etc") Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 13 ++++++++++++- kernel/fork.c | 15 ++------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1149533aa2fa..9806184bb3d5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -36,7 +36,18 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -extern void mmdrop(struct mm_struct *mm); +extern void __mmdrop(struct mm_struct *mm); + +static inline void mmdrop(struct mm_struct *mm) +{ + /* + * The implicit full barrier implied by atomic_dec_and_test() is + * required by the membarrier system call before returning to + * user-space, after storing to rq->curr. + */ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} /** * mmget() - Pin the address space associated with a &struct mm_struct. diff --git a/kernel/fork.c b/kernel/fork.c index be8aa5b98666..e5d9d405ae4e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -592,7 +592,7 @@ static void check_mm(struct mm_struct *mm) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -static void __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); @@ -603,18 +603,7 @@ static void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); free_mm(mm); } - -void mmdrop(struct mm_struct *mm) -{ - /* - * The implicit full barrier implied by atomic_dec_and_test() is - * required by the membarrier system call before returning to - * user-space, after storing to rq->curr. - */ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} -EXPORT_SYMBOL_GPL(mmdrop); +EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { -- cgit v1.2.3-71-gd317 From 88913bd8ea2a75d7e460a4bed5f75e1c32660d7e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Feb 2018 14:45:32 -0800 Subject: kernel/relay.c: limit kmalloc size to KMALLOC_MAX_SIZE chan->n_subbufs is set by the user and relay_create_buf() does a kmalloc() of chan->n_subbufs * sizeof(size_t *). kmalloc_slab() will generate a warning when this fails if chan->subbufs * sizeof(size_t *) > KMALLOC_MAX_SIZE. Limit chan->n_subbufs to the maximum allowed kmalloc() size. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1802061216100.122576@chino.kir.corp.google.com Fixes: f6302f1bcd75 ("relay: prevent integer overflow in relay_open()") Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Cc: Jens Axboe Cc: Dave Jiang Cc: Al Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c3029402f15c..c955b10c973c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -163,7 +163,7 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf; - if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) + if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *)) return NULL; buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); -- cgit v1.2.3-71-gd317 From 63bb0045b98ae821e56e27c2250e14bb0ae663e5 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Tue, 20 Feb 2018 19:47:46 -0700 Subject: ptrace, seccomp: tweak get_metadata behavior slightly Previously if users passed a small size for the input structure size, they would get get odd behavior. It doesn't make sense to pass a structure smaller than at least filter_off size, so let's just give -EINVAL in this case. This changes userspace visible behavior, but was only introduced in commit 26500475ac1b ("ptrace, seccomp: add support for retrieving seccomp metadata") in 4.16-rc2, so should be safe to change if merged before then. Reported-by: Eugene Syromiatnikov Signed-off-by: Tycho Andersen CC: Kees Cook CC: Oleg Nesterov Signed-off-by: Kees Cook --- kernel/seccomp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 61bd9dc260c8..1245b2338fff 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1076,14 +1076,16 @@ long seccomp_get_metadata(struct task_struct *task, size = min_t(unsigned long, size, sizeof(kmd)); - if (copy_from_user(&kmd, data, size)) + if (size < sizeof(kmd.filter_off)) + return -EINVAL; + + if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off))) return -EFAULT; filter = get_nth_filter(task, kmd.filter_off); if (IS_ERR(filter)) return PTR_ERR(filter); - memset(&kmd, 0, sizeof(kmd)); if (filter->log) kmd.flags |= SECCOMP_FILTER_FLAG_LOG; -- cgit v1.2.3-71-gd317 From bef3efbeb897b56867e271cdbc5f8adaacaeb9cd Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 22 Feb 2018 09:15:06 -0800 Subject: efivarfs: Limit the rate for non-root to read files Each read from a file in efivarfs results in two calls to EFI (one to get the file size, another to get the actual data). On X86 these EFI calls result in broadcast system management interrupts (SMI) which affect performance of the whole system. A malicious user can loop performing reads from efivarfs bringing the system to its knees. Linus suggested per-user rate limit to solve this. So we add a ratelimit structure to "user_struct" and initialize it for the root user for no limit. When allocating user_struct for other users we set the limit to 100 per second. This could be used for other places that want to limit the rate of some detrimental user action. In efivarfs if the limit is exceeded when reading, we take an interruptible nap for 50ms and check the rate limit again. Signed-off-by: Tony Luck Acked-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- fs/efivarfs/file.c | 6 ++++++ include/linux/sched/user.h | 4 ++++ kernel/user.c | 3 +++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 5f22e74bbade..8e568428c88b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; + while (!__ratelimit(&file->f_cred->user->ratelimit)) { + if (!msleep_interruptible(50)) + return -EINTR; + } + err = efivar_entry_size(var, &datasize); /* diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 0dcf4e480ef7..96fe289c4c6e 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include struct key; @@ -41,6 +42,9 @@ struct user_struct { defined(CONFIG_NET) atomic_long_t locked_vm; #endif + + /* Miscellaneous per-user rate limit */ + struct ratelimit_state ratelimit; }; extern int uids_sysfs_init(void); diff --git a/kernel/user.c b/kernel/user.c index 9a20acce460d..36288d840675 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,7 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, + .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* @@ -191,6 +192,8 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; atomic_set(&new->__count, 1); + ratelimit_state_init(&new->ratelimit, HZ, 100); + ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced -- cgit v1.2.3-71-gd317 From 32fff239de37ef226d5b66329dd133f64d63b22d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2018 08:33:24 -0800 Subject: bpf: add schedule points in percpu arrays management syszbot managed to trigger RCU detected stalls in bpf_array_free_percpu() It takes time to allocate a huge percpu map, but even more time to free it. Since we run in process context, use cond_resched() to yield cpu if needed. Fixes: a10423b87a7e ("bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a364c408f25a..14750e7c5ee4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -26,8 +26,10 @@ static void bpf_array_free_percpu(struct bpf_array *array) { int i; - for (i = 0; i < array->map.max_entries; i++) + for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); + cond_resched(); + } } static int bpf_array_alloc_percpu(struct bpf_array *array) @@ -43,6 +45,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) return -ENOMEM; } array->pptrs[i] = ptr; + cond_resched(); } return 0; -- cgit v1.2.3-71-gd317 From 6c5f61023c5b0edb0c8a64c902fe97c6453b1852 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Feb 2018 10:10:35 -0800 Subject: bpf: fix rcu lockdep warning for lpm_trie map_free callback Commit 9a3efb6b661f ("bpf: fix memory leak in lpm_trie map_free callback function") fixed a memory leak and removed unnecessary locks in map_free callback function. Unfortrunately, it introduced a lockdep warning. When lockdep checking is turned on, running tools/testing/selftests/bpf/test_lpm_map will have: [ 98.294321] ============================= [ 98.294807] WARNING: suspicious RCU usage [ 98.295359] 4.16.0-rc2+ #193 Not tainted [ 98.295907] ----------------------------- [ 98.296486] /home/yhs/work/bpf/kernel/bpf/lpm_trie.c:572 suspicious rcu_dereference_check() usage! [ 98.297657] [ 98.297657] other info that might help us debug this: [ 98.297657] [ 98.298663] [ 98.298663] rcu_scheduler_active = 2, debug_locks = 1 [ 98.299536] 2 locks held by kworker/2:1/54: [ 98.300152] #0: ((wq_completion)"events"){+.+.}, at: [<00000000196bc1f0>] process_one_work+0x157/0x5c0 [ 98.301381] #1: ((work_completion)(&map->work)){+.+.}, at: [<00000000196bc1f0>] process_one_work+0x157/0x5c0 Since actual trie tree removal happens only after no other accesses to the tree are possible, replacing rcu_dereference_protected(*slot, lockdep_is_held(&trie->lock)) with rcu_dereference_protected(*slot, 1) fixed the issue. Fixes: 9a3efb6b661f ("bpf: fix memory leak in lpm_trie map_free callback function") Reported-by: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Yonghong Song Reviewed-by: Eric Dumazet Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index a75e02c961b5..b4b5b81e7251 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -569,8 +569,7 @@ static void trie_free(struct bpf_map *map) slot = &trie->root; for (;;) { - node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)); + node = rcu_dereference_protected(*slot, 1); if (!node) goto out; -- cgit v1.2.3-71-gd317 From 651ca2c00405a2ae3870cc0b4f15a182eb6fbe26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Feb 2018 12:08:05 +0100 Subject: genirq/matrix: Handle CPU offlining proper At CPU hotunplug the corresponding per cpu matrix allocator is shut down and the allocated interrupt bits are discarded under the assumption that all allocated bits have been either migrated away or shut down through the managed interrupts mechanism. This is not true because interrupts which are not started up might have a vector allocated on the outgoing CPU. When the interrupt is started up later or completely shutdown and freed then the allocated vector is handed back, triggering warnings or causing accounting issues which result in suspend failures and other issues. Change the CPU hotplug mechanism of the matrix allocator so that the remaining allocations at unplug time are preserved and global accounting at hotplug is correctly readjusted to take the dormant vectors into account. Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: Yuriy Vostrikov Signed-off-by: Thomas Gleixner Tested-by: Yuriy Vostrikov Cc: Peter Zijlstra Cc: Randy Dunlap Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180222112316.849980972@linutronix.de --- kernel/irq/matrix.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5187dfe809ac..4c5770407031 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -16,6 +16,7 @@ struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; + bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; unsigned long managed_map[IRQ_MATRIX_SIZE]; @@ -81,9 +82,11 @@ void irq_matrix_online(struct irq_matrix *m) BUG_ON(cm->online); - bitmap_zero(cm->alloc_map, m->matrix_bits); - cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); - cm->allocated = 0; + if (!cm->initialized) { + cm->available = m->alloc_size; + cm->available -= cm->managed + m->systembits_inalloc; + cm->initialized = true; + } m->global_available += cm->available; cm->online = true; m->online_maps++; @@ -370,14 +373,16 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; - if (cm->online) { - clear_bit(bit, cm->alloc_map); - cm->allocated--; + clear_bit(bit, cm->alloc_map); + cm->allocated--; + + if (cm->online) m->total_allocated--; - if (!managed) { - cm->available++; + + if (!managed) { + cm->available++; + if (cm->online) m->global_available++; - } } trace_irq_matrix_free(bit, cpu, m, cm); } -- cgit v1.2.3-71-gd317