From 33ccec5fd740d0d5b78b77846f76eb5b4feb4327 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Feb 2021 10:45:25 +0300 Subject: bpf: Fix a warning message in mark_ptr_not_null_reg() The WARN_ON() argument is a condition, not an error message. So this code will print a stack trace but will not print the warning message. Fix that and also change it to only WARN_ONCE(). Fixes: 4ddb74165ae5 ("bpf: Extract nullable reg type conversion into a helper function") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/YCzJlV3hnF%2Ft1Pk4@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dda9d81f12c..3d34ba492d46 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1120,7 +1120,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) reg->type = PTR_TO_RDWR_BUF; break; default: - WARN_ON("unknown nullable register type"); + WARN_ONCE(1, "unknown nullable register type"); } } -- cgit v1.2.3-71-gd317 From 53f523f3052ac16bbc7718032aa6b848f971d28c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Feb 2021 16:16:47 -0800 Subject: bpf: Clear percpu pointers in bpf_prog_clone_free() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to bpf_prog_realloc(), bpf_prog_clone_create() also copies the percpu pointers, but the clone still shares them with the original prog, so we have to clear these two percpu pointers in bpf_prog_clone_free(). Otherwise we would get a double free: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 13 PID: 8140 Comm: kworker/13:247 Kdump: loaded Tainted: G                W    OE   5.11.0-rc4.bm.1-amd64+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 test_bpf: #1 TXA Workqueue: events bpf_prog_free_deferred RIP: 0010:percpu_ref_get_many.constprop.97+0x42/0xf0 Code: [...] RSP: 0018:ffffa6bce1f9bda0 EFLAGS: 00010002 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000021dfc7b RDX: ffffffffae2eeb90 RSI: 867f92637e338da5 RDI: 0000000000000046 RBP: ffffa6bce1f9bda8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000046 R11: 0000000000000000 R12: 0000000000000280 R13: 0000000000000000 R14: 0000000000000000 R15: ffff9b5f3ffdedc0 FS:    0000000000000000(0000) GS:ffff9b5f2fb40000(0000) knlGS:0000000000000000 CS:    0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000027c36c002 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace:     refill_obj_stock+0x5e/0xd0     free_percpu+0xee/0x550     __bpf_prog_free+0x4d/0x60     process_one_work+0x26a/0x590     worker_thread+0x3c/0x390     ? process_one_work+0x590/0x590     kthread+0x130/0x150     ? kthread_park+0x80/0x80     ret_from_fork+0x1f/0x30 This bug is 100% reproducible with test_kmod.sh. Fixes: 700d4796ef59 ("bpf: Optimize program stats") Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210218001647.71631-1-xiyou.wangcong@gmail.com --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0ae015ad1e05..aa1e64196d8d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1118,6 +1118,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp) * clone is guaranteed to not be locked. */ fp->aux = NULL; + fp->stats = NULL; + fp->active = NULL; __bpf_prog_free(fp); } -- cgit v1.2.3-71-gd317 From f4eda8b6e4a5c7897c6bb992ed63a27061b371ef Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 23 Feb 2021 13:04:16 +0400 Subject: bpf: Drop imprecise log message Now it is possible for global function to have a pointer argument that points to something different than struct. Drop the irrelevant log message and keep the logic same. Fixes: e5069b9c23b3 ("bpf: Support pointers in global func args") Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210223090416.333943-1-me@ubique.spb.ru --- kernel/bpf/btf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2efeb5f4b343..b1a76fe046cb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4321,8 +4321,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - if (log->level & BPF_LOG_LEVEL) - bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); -- cgit v1.2.3-71-gd317 From 83a2881903f3d5bc08ded4fb04f6e3bedb1fba65 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 1 Mar 2021 16:40:19 +0100 Subject: bpf: Account for BPF_FETCH in insn_has_def32() insn_has_def32() returns false for 32-bit BPF_FETCH insns. This makes adjust_insn_aux_data() incorrectly set zext_dst, as can be seen in [1]. This happens because insn_no_def() does not know about the BPF_FETCH variants of BPF_STX. Fix in two steps. First, replace insn_no_def() with insn_def_regno(), which returns the register an insn defines. Normally insn_no_def() calls are followed by insn->dst_reg uses; replace those with the insn_def_regno() return value. Second, adjust the BPF_STX special case in is_reg64() to deal with queries made from opt_subreg_zext_lo32_rnd_hi32(), where the state information is no longer available. Add a comment, since the purpose of this special case is not clear at first glance. [1] https://lore.kernel.org/bpf/20210223150845.1857620-1-jackmanb@google.com/ Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Brendan Jackman Link: https://lore.kernel.org/bpf/20210301154019.129110-1-iii@linux.ibm.com --- kernel/bpf/verifier.c | 70 ++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d34ba492d46..bb3eaab934f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1703,7 +1703,11 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - if (reg->type != SCALAR_VALUE) + /* BPF_STX (including atomic variants) has multiple source + * operands, one of which is a ptr. Check whether the caller is + * asking about it. + */ + if (t == SRC_OP && reg->type != SCALAR_VALUE) return true; return BPF_SIZE(code) == BPF_DW; } @@ -1735,22 +1739,38 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } -/* Return TRUE if INSN doesn't have explicit value define. */ -static bool insn_no_def(struct bpf_insn *insn) +/* Return the regno defined by the insn, or -1. */ +static int insn_def_regno(const struct bpf_insn *insn) { - u8 class = BPF_CLASS(insn->code); - - return (class == BPF_JMP || class == BPF_JMP32 || - class == BPF_STX || class == BPF_ST); + switch (BPF_CLASS(insn->code)) { + case BPF_JMP: + case BPF_JMP32: + case BPF_ST: + return -1; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm & BPF_FETCH)) { + if (insn->imm == BPF_CMPXCHG) + return BPF_REG_0; + else + return insn->src_reg; + } else { + return -1; + } + default: + return insn->dst_reg; + } } /* Return TRUE if INSN has defined any 32-bit value explicitly. */ static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) { - if (insn_no_def(insn)) + int dst_reg = insn_def_regno(insn); + + if (dst_reg == -1) return false; - return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); + return !is_reg64(env, insn, dst_reg, NULL, DST_OP); } static void mark_insn_zext(struct bpf_verifier_env *env, @@ -11006,9 +11026,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; - u8 load_reg; + int load_reg; insn = insns[adj_idx]; + load_reg = insn_def_regno(&insn); if (!aux[adj_idx].zext_dst) { u8 code, class; u32 imm_rnd; @@ -11018,14 +11039,14 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, code = insn.code; class = BPF_CLASS(code); - if (insn_no_def(&insn)) + if (load_reg == -1) continue; /* NOTE: arg "reg" (the fourth one) is only used for - * BPF_STX which has been ruled out in above - * check, it is safe to pass NULL here. + * BPF_STX + SRC_OP, so it is safe to pass NULL + * here. */ - if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; @@ -11040,7 +11061,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, imm_rnd = get_random_int(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; - rnd_hi32_patch[3].dst_reg = insn.dst_reg; + rnd_hi32_patch[3].dst_reg = load_reg; patch = rnd_hi32_patch; patch_len = 4; goto apply_patch_buffer; @@ -11049,22 +11070,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; - /* zext_dst means that we want to zero-extend whatever register - * the insn defines, which is dst_reg most of the time, with - * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. - */ - if (BPF_CLASS(insn.code) == BPF_STX && - BPF_MODE(insn.code) == BPF_ATOMIC) { - /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not - * define any registers, therefore zext_dst cannot be - * set. - */ - if (WARN_ON(!(insn.imm & BPF_FETCH))) - return -EINVAL; - load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 - : insn.src_reg; - } else { - load_reg = insn.dst_reg; + if (WARN_ON(load_reg == -1)) { + verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); + return -EFAULT; } zext_patch[0] = insn; -- cgit v1.2.3-71-gd317 From 39491867ace594b4912c35f576864d204beed2b3 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 4 Mar 2021 18:56:46 -0800 Subject: bpf: Explicitly zero-extend R0 after 32-bit cmpxchg As pointed out by Ilya and explained in the new comment, there's a discrepancy between x86 and BPF CMPXCHG semantics: BPF always loads the value from memory into r0, while x86 only does so when r0 and the value in memory are different. The same issue affects s390. At first this might sound like pure semantics, but it makes a real difference when the comparison is 32-bit, since the load will zero-extend r0/rax. The fix is to explicitly zero-extend rax after doing such a CMPXCHG. Since this problem affects multiple archs, this is done in the verifier by patching in a BPF_ZEXT_REG instruction after every 32-bit cmpxchg. Any archs that don't need such manual zero-extension can do a look-ahead with insn_is_zext to skip the unnecessary mov. Note this still goes on top of Ilya's patch: https://lore.kernel.org/bpf/20210301154019.129110-1-iii@linux.ibm.com/T/#u Differences v5->v6[1]: - Moved is_cmpxchg_insn and ensured it can be safely re-used. Also renamed it and removed 'inline' to match the style of the is_*_function helpers. - Fixed up comments in verifier test (thanks for the careful review, Martin!) Differences v4->v5[1]: - Moved the logic entirely into opt_subreg_zext_lo32_rnd_hi32, thanks to Martin for suggesting this. Differences v3->v4[1]: - Moved the optimization against pointless zext into the correct place: opt_subreg_zext_lo32_rnd_hi32 is called _after_ fixup_bpf_calls. Differences v2->v3[1]: - Moved patching into fixup_bpf_calls (patch incoming to rename this function) - Added extra commentary on bpf_jit_needs_zext - Added check to avoid adding a pointless zext(r0) if there's already one there. Difference v1->v2[1]: Now solved centrally in the verifier instead of specifically for the x86 JIT. Thanks to Ilya and Daniel for the suggestions! [1] v5: https://lore.kernel.org/bpf/CA+i-1C3ytZz6FjcPmUg5s4L51pMQDxWcZNvM86w4RHZ_o2khwg@mail.gmail.com/T/#t v4: https://lore.kernel.org/bpf/CA+i-1C3ytZz6FjcPmUg5s4L51pMQDxWcZNvM86w4RHZ_o2khwg@mail.gmail.com/T/#t v3: https://lore.kernel.org/bpf/08669818-c99d-0d30-e1db-53160c063611@iogearbox.net/T/#t v2: https://lore.kernel.org/bpf/08669818-c99d-0d30-e1db-53160c063611@iogearbox.net/T/#t v1: https://lore.kernel.org/bpf/d7ebaefb-bfd6-a441-3ff2-2fdfe699b1d2@iogearbox.net/T/#t Reported-by: Ilya Leoshkevich Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg") Signed-off-by: Brendan Jackman Acked-by: Martin KaFai Lau Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++++ kernel/bpf/verifier.c | 19 +++++++++++++++- .../selftests/bpf/verifier/atomic_cmpxchg.c | 25 ++++++++++++++++++++++ tools/testing/selftests/bpf/verifier/atomic_or.c | 25 ++++++++++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa1e64196d8d..3a283bf97f2f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2344,6 +2344,10 @@ bool __weak bpf_helper_changes_pkt_data(void *func) /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. + * + * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if + * you don't override this. JITs that don't want these extra insns can detect + * them using insn_is_zext. */ bool __weak bpf_jit_needs_zext(void) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb3eaab934f3..c56e3fcb5f1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,13 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } +static bool is_cmpxchg_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -11067,7 +11074,17 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, goto apply_patch_buffer; } - if (!bpf_jit_needs_zext()) + /* Add in an zero-extend instruction if a) the JIT has requested + * it or b) it's a CMPXCHG. + * + * The latter is because: BPF_CMPXCHG always loads a value into + * R0, therefore always zero-extends. However some archs' + * equivalent instruction only does this load when the + * comparison is successful. This detail of CMPXCHG is + * orthogonal to the general zero-extension behaviour of the + * CPU, so it's treated independently of bpf_jit_needs_zext. + */ + if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; if (WARN_ON(load_reg == -1)) { diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c index 2efd8bcf57a1..6e52dfc64415 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c +++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c @@ -94,3 +94,28 @@ .result = REJECT, .errstr = "invalid read from stack", }, +{ + "BPF_W cmpxchg should zero top 32 bits", + .insns = { + /* r0 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* u64 val = r0; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + /* r0 = (u32)atomic_cmpxchg((u32 *)&val, r0, 1); */ + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8), + /* r1 = 0x00000000FFFFFFFFull; */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* if (r0 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_1, 2), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_or.c b/tools/testing/selftests/bpf/verifier/atomic_or.c index 70f982e1f9f0..9d0716ac5080 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_or.c +++ b/tools/testing/selftests/bpf/verifier/atomic_or.c @@ -75,3 +75,28 @@ }, .result = ACCEPT, }, +{ + "BPF_W atomic_fetch_or should zero top 32 bits", + .insns = { + /* r1 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* u64 val = r1; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* r1 = (u32)atomic_fetch_or((u32 *)&val, 2); */ + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_ATOMIC_OP(BPF_W, BPF_OR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* r2 = 0x00000000FFFFFFFF; */ + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 1), + /* if (r2 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_1, 2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, -- cgit v1.2.3-71-gd317 From 8a6edb5257e2a84720fe78cb179eca58ba76126f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Feb 2021 13:10:35 +0100 Subject: sched: Fix migration_cpu_stop() requeueing When affine_move_task(p) is called on a running task @p, which is not otherwise already changing affinity, we'll first set p->migration_pending and then do: stop_one_cpu(cpu_of_rq(rq), migration_cpu_stop, &arg); This then gets us to migration_cpu_stop() running on the CPU that was previously running our victim task @p. If we find that our task is no longer on that runqueue (this can happen because of a concurrent migration due to load-balance etc.), then we'll end up at the: } else if (dest_cpu < 1 || pending) { branch. Which we'll take because we set pending earlier. Here we first check if the task @p has already satisfied the affinity constraints, if so we bail early [A]. Otherwise we'll reissue migration_cpu_stop() onto the CPU that is now hosting our task @p: stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); Except, we've never initialized pending->arg, which will be all 0s. This then results in running migration_cpu_stop() on the next CPU with arg->p == NULL, which gives the by now obvious result of fireworks. The cure is to change affine_move_task() to always use pending->arg, furthermore we can use the exact same pattern as the SCA_MIGRATE_ENABLE case, since we'll block on the pending->done completion anyway, no point in adding yet another completion in stop_one_cpu(). This then gives a clear distinction between the two migration_cpu_stop() use cases: - sched_exec() / migrate_task_to() : arg->pending == NULL - affine_move_task() : arg->pending != NULL; And we can have it ignore p->migration_pending when !arg->pending. Any stop work from sched_exec() / migrate_task_to() is in addition to stop works from affine_move_task(), which will be sufficient to issue the completion. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.357743989@infradead.org --- kernel/sched/core.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca2bb629595f..79ddba55b123 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1922,6 +1922,24 @@ static int migration_cpu_stop(void *data) rq_lock(rq, &rf); pending = p->migration_pending; + if (pending && !arg->pending) { + /* + * This happens from sched_exec() and migrate_task_to(), + * neither of them care about pending and just want a task to + * maybe move about. + * + * Even if there is a pending, we can ignore it, since + * affine_move_task() will have it's own stop_work's in flight + * which will manage the completion. + * + * Notably, pending doesn't need to match arg->pending. This can + * happen when tripple concurrent affine_move_task() first sets + * pending, then clears pending and eventually sets another + * pending. + */ + pending = NULL; + } + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -2194,10 +2212,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag int dest_cpu, unsigned int flags) { struct set_affinity_pending my_pending = { }, *pending = NULL; - struct migration_arg arg = { - .task = p, - .dest_cpu = dest_cpu, - }; bool complete = false; /* Can the task run on the task's current CPU? If so, we're done */ @@ -2235,6 +2249,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag /* Install the request */ refcount_set(&my_pending.refs, 1); init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, /* any */ + .pending = &my_pending, + }; + p->migration_pending = &my_pending; } else { pending = p->migration_pending; @@ -2265,12 +2285,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag p->migration_flags &= ~MDF_PUSH; task_rq_unlock(rq, p, rf); - pending->arg = (struct migration_arg) { - .task = p, - .dest_cpu = -1, - .pending = pending, - }; - stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); @@ -2283,8 +2297,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag * is_migration_disabled(p) checks to the stopper, which will * run on the same CPU as said p. */ + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ task_rq_unlock(rq, p, rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); } else { -- cgit v1.2.3-71-gd317 From c20cf065d4a619d394d23290093b1002e27dff86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2021 11:50:39 +0100 Subject: sched: Simplify migration_cpu_stop() When affine_move_task() issues a migration_cpu_stop(), the purpose of that function is to complete that @pending, not any random other p->migration_pending that might have gotten installed since. This realization much simplifies migration_cpu_stop() and allows further necessary steps to fix all this as it provides the guarantee that @pending's stopper will complete @pending (and not some random other @pending). Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.430014682@infradead.org --- kernel/sched/core.c | 56 ++++++++--------------------------------------------- 1 file changed, 8 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79ddba55b123..088e8f492271 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1898,8 +1898,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { - struct set_affinity_pending *pending; struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); @@ -1921,25 +1921,6 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); - pending = p->migration_pending; - if (pending && !arg->pending) { - /* - * This happens from sched_exec() and migrate_task_to(), - * neither of them care about pending and just want a task to - * maybe move about. - * - * Even if there is a pending, we can ignore it, since - * affine_move_task() will have it's own stop_work's in flight - * which will manage the completion. - * - * Notably, pending doesn't need to match arg->pending. This can - * happen when tripple concurrent affine_move_task() first sets - * pending, then clears pending and eventually sets another - * pending. - */ - pending = NULL; - } - /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1950,31 +1931,20 @@ static int migration_cpu_stop(void *data) goto out; if (pending) { - p->migration_pending = NULL; + if (p->migration_pending == pending) + p->migration_pending = NULL; complete = true; } - /* migrate_enable() -- we must not race against SCA */ - if (dest_cpu < 0) { - /* - * When this was migrate_enable() but we no longer - * have a @pending, a concurrent SCA 'fixed' things - * and we should be valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); - goto out; - } - + if (dest_cpu < 0) dest_cpu = cpumask_any_distribute(&p->cpus_mask); - } if (task_on_rq_queued(p)) rq = __migrate_task(rq, &rf, p, dest_cpu); else p->wake_cpu = dest_cpu; - } else if (dest_cpu < 0 || pending) { + } else if (pending) { /* * This happens when we get migrated between migrate_enable()'s * preempt_enable() and scheduling the stopper task. At that @@ -1989,22 +1959,13 @@ static int migration_cpu_stop(void *data) * ->pi_lock, so the allowed mask is stable - if it got * somewhere allowed, we're done. */ - if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { - p->migration_pending = NULL; + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + if (p->migration_pending == pending) + p->migration_pending = NULL; complete = true; goto out; } - /* - * When this was migrate_enable() but we no longer have an - * @pending, a concurrent SCA 'fixed' things and we should be - * valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); - goto out; - } - /* * When migrate_enable() hits a rq mis-match we can't reliably * determine is_migration_disabled() and so have to chase after @@ -2022,7 +1983,6 @@ out: complete_all(&pending->done); /* For pending->{arg,stop_work} */ - pending = arg->pending; if (pending && refcount_dec_and_test(&pending->refs)) wake_up_var(&pending->refs); -- cgit v1.2.3-71-gd317 From 58b1a45086b5f80f2b2842aa7ed0da51a64a302b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2021 11:15:23 +0100 Subject: sched: Collate affine_move_task() stoppers The SCA_MIGRATE_ENABLE and task_running() cases are almost identical, collapse them to avoid further duplication. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.500108964@infradead.org --- kernel/sched/core.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 088e8f492271..84b657f05625 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2239,30 +2239,23 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (flags & SCA_MIGRATE_ENABLE) { - - refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ - p->migration_flags &= ~MDF_PUSH; - task_rq_unlock(rq, p, rf); - - stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, - &pending->arg, &pending->stop_work); - - return 0; - } - if (task_running(rq, p) || p->state == TASK_WAKING) { /* - * Lessen races (and headaches) by delegating - * is_migration_disabled(p) checks to the stopper, which will - * run on the same CPU as said p. + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. */ + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; task_rq_unlock(rq, p, rf); stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); + if (flags & SCA_MIGRATE_ENABLE) + return 0; } else { if (!is_migration_disabled(p)) { -- cgit v1.2.3-71-gd317 From 3f1bc119cd7fc987c8ed25ffb717f99403bb308c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2021 11:21:35 +0100 Subject: sched: Optimize migration_cpu_stop() When the purpose of migration_cpu_stop() is to migrate the task to 'any' valid CPU, don't migrate the task when it's already running on a valid CPU. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.569238629@infradead.org --- kernel/sched/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84b657f05625..ac05afbd982e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1936,14 +1936,25 @@ static int migration_cpu_stop(void *data) complete = true; } - if (dest_cpu < 0) + if (dest_cpu < 0) { + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) + goto out; + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } if (task_on_rq_queued(p)) rq = __migrate_task(rq, &rf, p, dest_cpu); else p->wake_cpu = dest_cpu; + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + } else if (pending) { /* * This happens when we get migrated between migrate_enable()'s -- cgit v1.2.3-71-gd317 From 9e81889c7648d48dd5fe13f41cbc99f3c362484a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2021 11:31:09 +0100 Subject: sched: Fix affine_move_task() self-concurrency Consider: sched_setaffinity(p, X); sched_setaffinity(p, Y); Then the first will install p->migration_pending = &my_pending; and issue stop_one_cpu_nowait(pending); and the second one will read p->migration_pending and _also_ issue: stop_one_cpu_nowait(pending), the _SAME_ @pending. This causes stopper list corruption. Add set_affinity_pending::stop_pending, to indicate if a stopper is in progress. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.649146419@infradead.org --- kernel/sched/core.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac05afbd982e..4e4d100c1f7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1864,6 +1864,7 @@ struct migration_arg { struct set_affinity_pending { refcount_t refs; + unsigned int stop_pending; struct completion done; struct cpu_stop_work stop_work; struct migration_arg arg; @@ -1982,12 +1983,15 @@ static int migration_cpu_stop(void *data) * determine is_migration_disabled() and so have to chase after * it. */ + WARN_ON_ONCE(!pending->stop_pending); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); return 0; } out: + if (pending) + pending->stop_pending = false; task_rq_unlock(rq, p, &rf); if (complete) @@ -2183,7 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag int dest_cpu, unsigned int flags) { struct set_affinity_pending my_pending = { }, *pending = NULL; - bool complete = false; + bool stop_pending, complete = false; /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { @@ -2256,14 +2260,19 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag * anything else we cannot do is_migration_disabled(), punt * and have the stopper function handle it all race-free. */ + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; task_rq_unlock(rq, p, rf); - stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, - &pending->arg, &pending->stop_work); + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } if (flags & SCA_MIGRATE_ENABLE) return 0; -- cgit v1.2.3-71-gd317 From 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2021 11:42:08 +0100 Subject: sched: Simplify set_affinity_pending refcounts Now that we have set_affinity_pending::stop_pending to indicate if a stopper is in progress, and we have the guarantee that if that stopper exists, it will (eventually) complete our @pending we can simplify the refcount scheme by no longer counting the stopper thread. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Cc: stable@kernel.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210224131355.724130207@infradead.org --- kernel/sched/core.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4e4d100c1f7a..98191218d891 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1862,6 +1862,10 @@ struct migration_arg { struct set_affinity_pending *pending; }; +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ struct set_affinity_pending { refcount_t refs; unsigned int stop_pending; @@ -1997,10 +2001,6 @@ out: if (complete) complete_all(&pending->done); - /* For pending->{arg,stop_work} */ - if (pending && refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); - return 0; } @@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag push_task = get_task_struct(p); } + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ pending = p->migration_pending; - if (pending) { - refcount_inc(&pending->refs); + if (pending && !pending->stop_pending) { p->migration_pending = NULL; complete = true; } + task_rq_unlock(rq, p, rf); if (push_task) { @@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } if (complete) - goto do_complete; + complete_all(&pending->done); return 0; } @@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (!stop_pending) pending->stop_pending = true; - refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); if (!stop_pending) { @@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (task_on_rq_queued(p)) rq = move_queued_task(rq, rf, p, dest_cpu); - p->migration_pending = NULL; - complete = true; + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } } task_rq_unlock(rq, p, rf); -do_complete: if (complete) complete_all(&pending->done); } @@ -2295,7 +2300,7 @@ do_complete: wait_for_completion(&pending->done); if (refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); + wake_up_var(&pending->refs); /* No UaF, just an address */ /* * Block the original owner of &pending until all subsequent callers @@ -2303,6 +2308,9 @@ do_complete: */ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + return 0; } -- cgit v1.2.3-71-gd317 From ce29ddc47b91f97e7f69a0fb7cbb5845f52a9825 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 17 Feb 2021 11:56:51 -0500 Subject: sched/membarrier: fix missing local execution of ipi_sync_rq_state() The function sync_runqueues_membarrier_state() should copy the membarrier state from the @mm received as parameter to each runqueue currently running tasks using that mm. However, the use of smp_call_function_many() skips the current runqueue, which is unintended. Replace by a call to on_each_cpu_mask(). Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") Reported-by: Nadav Amit Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org # 5.4.x+ Link: https://lore.kernel.org/r/74F1E842-4A84-47BF-B6C2-5407DFDD4A4A@gmail.com --- kernel/sched/membarrier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index acdae625c636..b5add64d9698 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) } rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); - preempt_enable(); + on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); free_cpumask_var(tmpmask); cpus_read_unlock(); -- cgit v1.2.3-71-gd317 From a5398bffc01fe044848c5024e5e867e407f239b8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 30 Nov 2020 11:38:40 -0800 Subject: perf/core: Flush PMU internal buffers for per-CPU events Sometimes the PMU internal buffers have to be flushed for per-CPU events during a context switch, e.g., large PEBS. Otherwise, the perf tool may report samples in locations that do not belong to the process where the samples are processed in, because PEBS does not tag samples with PID/TID. The current code only flush the buffers for a per-task event. It doesn't check a per-CPU event. Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the PMU internal buffers have to be flushed for this event during a context switch. Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx which is required to be flushed. Only need to invoke the sched_task() for per-CPU events in this patch. The per-task events have been handled in perf_event_context_sched_in/out already. Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches") Reported-by: Gabriel Marin Originally-by: Namhyung Kim Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 42 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fab42cfbd350..3f7f89ea5e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -606,6 +606,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; @@ -872,6 +873,7 @@ struct perf_cpu_context { struct list_head cgrp_cpuctx_entry; #endif + struct list_head sched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0aeca5f3c0ac..03db40f6cba9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - cpuctx->sched_cb_usage++; + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + + this_cpu_inc(perf_sched_cb_usages); } /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + + if (prev == next) + return; + + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + /* will be handled in perf_event_context_sched_in/out */ + if (cpuctx->task_ctx) + continue; + + __perf_pmu_sched_task(cpuctx, sched_in); + } +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); @@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); @@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } -- cgit v1.2.3-71-gd317 From 46eb1701c046cc18c032fa68f3c8ccbf24483ee4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 23 Feb 2021 17:02:40 +0100 Subject: hrtimer: Update softirq_expires_next correctly after __hrtimer_get_next_event() hrtimer_force_reprogram() and hrtimer_interrupt() invokes __hrtimer_get_next_event() to find the earliest expiry time of hrtimer bases. __hrtimer_get_next_event() does not update cpu_base::[softirq_]_expires_next to preserve reprogramming logic. That needs to be done at the callsites. hrtimer_force_reprogram() updates cpu_base::softirq_expires_next only when the first expiring timer is a softirq timer and the soft interrupt is not activated. That's wrong because cpu_base::softirq_expires_next is left stale when the first expiring timer of all bases is a timer which expires in hard interrupt context. hrtimer_interrupt() does never update cpu_base::softirq_expires_next which is wrong too. That becomes a problem when clock_settime() sets CLOCK_REALTIME forward and the first soft expiring timer is in the CLOCK_REALTIME_SOFT base. Setting CLOCK_REALTIME forward moves the clock MONOTONIC based expiry time of that timer before the stale cpu_base::softirq_expires_next. cpu_base::softirq_expires_next is cached to make the check for raising the soft interrupt fast. In the above case the soft interrupt won't be raised until clock monotonic reaches the stale cpu_base::softirq_expires_next value. That's incorrect, but what's worse it that if the softirq timer becomes the first expiring timer of all clock bases after the hard expiry timer has been handled the reprogramming of the clockevent from hrtimer_interrupt() will result in an interrupt storm. That happens because the reprogramming does not use cpu_base::softirq_expires_next, it uses __hrtimer_get_next_event() which returns the actual expiry time. Once clock MONOTONIC reaches cpu_base::softirq_expires_next the soft interrupt is raised and the storm subsides. Change the logic in hrtimer_force_reprogram() to evaluate the soft and hard bases seperately, update softirq_expires_next and handle the case when a soft expiring timer is the first of all bases by comparing the expiry times and updating the required cpu base fields. Split this functionality into a separate function to be able to use it in hrtimer_interrupt() as well without copy paste. Fixes: 5da70160462e ("hrtimer: Implement support for softirq based hrtimers") Reported-by: Mikael Beckius Suggested-by: Thomas Gleixner Tested-by: Mikael Beckius Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210223160240.27518-1-anna-maria@linutronix.de --- kernel/time/hrtimer.c | 60 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 743c852e10f2..788b9d137de4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, } /* - * Recomputes cpu_base::*next_timer and returns the earliest expires_next but - * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of @@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ return expires_next; } +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ + ktime_t expires_next, soft = KTIME_MAX; + + /* + * If the soft interrupt has already been activated, ignore the + * soft bases. They will be handled in the already raised soft + * interrupt. + */ + if (!cpu_base->softirq_activated) { + soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + /* + * Update the soft expiry time. clock_settime() might have + * affected it. + */ + cpu_base->softirq_expires_next = soft; + } + + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * If a softirq timer is expiring first, update cpu_base->next_timer + * and program the hardware with the soft expiry time. + */ + if (expires_next > soft) { + cpu_base->next_timer = cpu_base->softirq_next_timer; + expires_next = soft; + } + + return expires_next; +} + static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; @@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - /* - * Find the current next expiration time. - */ - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - - if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { - /* - * When the softirq is activated, hrtimer has to be - * programmed with the first hard hrtimer because soft - * timer interrupt could occur too late. - */ - if (cpu_base->softirq_activated) - expires_next = __hrtimer_get_next_event(cpu_base, - HRTIMER_ACTIVE_HARD); - else - cpu_base->softirq_expires_next = expires_next; - } + expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -1644,8 +1662,8 @@ retry: __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); - /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + /* Reevaluate the clock bases for the [soft] next expiry */ + expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. -- cgit v1.2.3-71-gd317 From 69dd4503a7e6bae3389b8e028e5768008be8f2d7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Feb 2021 15:36:07 +0100 Subject: irqdomain: Remove debugfs_file from struct irq_domain There's no need to keep around a dentry pointer to a simple file that debugfs itself can look up when we need to remove it from the system. So simplify the code by deleting the variable and cleaning up the logic around the debugfs file. Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/YCvYV53ZdzQSWY6w@kroah.com --- include/linux/irqdomain.h | 4 ---- kernel/irq/irqdomain.c | 9 ++++----- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 42d196805f58..33cacc8af26d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -150,7 +150,6 @@ struct irq_domain_chip_generic; * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains - * @debugfs_file: dentry for the domain debugfs file * * Revmap data, used internally by irq_domain * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that @@ -174,9 +173,6 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif -#ifdef CONFIG_GENERIC_IRQ_DEBUGFS - struct dentry *debugfs_file; -#endif /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 288151393a06..d10ab1d689d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { - if (!d->name || !domain_dir || d->debugfs_file) + if (!d->name || !domain_dir) return; - d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &irq_domain_debug_fops); + debugfs_create_file(d->name, 0444, domain_dir, d, + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) { - debugfs_remove(d->debugfs_file); - d->debugfs_file = NULL; + debugfs_remove(debugfs_lookup(d->name, domain_dir)); } void __init irq_domain_debugfs_init(struct dentry *root) -- cgit v1.2.3-71-gd317 From e22bc9b481a90d7898984ea17621f04a653e2cd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 19:49:02 -0700 Subject: kernel: make IO threads unfreezable by default The io-wq threads were already marked as no-freeze, but the manager was not. On resume, we perpetually have signal_pending() being true, and hence the manager will loop and spin 100% of the time. Just mark the tasks created by create_io_thread() as PF_NOFREEZE by default, and remove any knowledge of it in io-wq and io_uring. Reported-by: Kevin Locke Tested-by: Kevin Locke Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +-- fs/io_uring.c | 1 - kernel/fork.c | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/fs/io-wq.c b/fs/io-wq.c index 3d7060ba547a..0ae9ecadf295 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -591,7 +591,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) tsk->pf_io_worker = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); - tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY; + tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -709,7 +709,6 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); - try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); diff --git a/fs/io_uring.c b/fs/io_uring.c index 62f998bf2ce8..14165e18020c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6733,7 +6733,6 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); schedule(); - try_to_freeze(); down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..72e444cd0ffe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2436,6 +2436,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + tsk->flags |= PF_NOFREEZE; } return tsk; } -- cgit v1.2.3-71-gd317 From 82e69a121be4b1597ce758534816a8ee04c8b761 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 12 Mar 2021 21:07:15 -0800 Subject: mm/fork: clear PASID for new mm When a new mm is created, its PASID should be cleared, i.e. the PASID is initialized to its init state 0 on both ARM and X86. This patch was part of the series introducing mm->pasid, but got lost along the way [1]. It still makes sense to have it, because each address space has a different PASID. And the IOMMU code in iommu_sva_alloc_pasid() expects the pasid field of a new mm struct to be cleared. [1] https://lore.kernel.org/linux-iommu/YDgh53AcQHT+T3L0@otcwcpicx3.sc.intel.com/ Link: https://lkml.kernel.org/r/20210302103837.2562625-1-jean-philippe@linaro.org Signed-off-by: Fenghua Yu Signed-off-by: Jean-Philippe Brucker Reviewed-by: Tony Luck Cc: Jacob Pan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 1 + kernel/fork.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0974ad501a47..6613b26a8894 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define INIT_PASID 0 struct address_space; struct mem_cgroup; diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..54cc905e5fe0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_pasid(struct mm_struct *mm) +{ +#ifdef CONFIG_IOMMU_SUPPORT + mm->pasid = INIT_PASID; +#endif +} + static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mm_init_pasid(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); -- cgit v1.2.3-71-gd317 From c995f12ad8842dbf5cfed113fb52cdd083f5afd1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 14 Mar 2021 23:51:14 +0300 Subject: prctl: fix PR_SET_MM_AUXV kernel stack leak Doing a prctl(PR_SET_MM, PR_SET_MM_AUXV, addr, 1); will copy 1 byte from userspace to (quite big) on-stack array and then stash everything to mm->saved_auxv. AT_NULL terminator will be inserted at the very end. /proc/*/auxv handler will find that AT_NULL terminator and copy original stack contents to userspace. This devious scheme requires CAP_SYS_RESOURCE. Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b09fe21e88ff..2e2e3f378d97 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2079,7 +2079,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ - unsigned long user_auxv[AT_VECTOR_SIZE]; + unsigned long user_auxv[AT_VECTOR_SIZE] = {}; if (len > sizeof(user_auxv)) return -EINVAL; -- cgit v1.2.3-71-gd317 From 5de2055d31ea88fd9ae9709ac95c372a505a60fa Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 16 Mar 2021 11:31:16 -0400 Subject: locking/ww_mutex: Simplify use_ww_ctx & ww_ctx handling The use_ww_ctx flag is passed to mutex_optimistic_spin(), but the function doesn't use it. The frequent use of the (use_ww_ctx && ww_ctx) combination is repetitive. In fact, ww_ctx should not be used at all if !use_ww_ctx. Simplify ww_mutex code by dropping use_ww_ctx from mutex_optimistic_spin() an clear ww_ctx if !use_ww_ctx. In this way, we can replace (use_ww_ctx && ww_ctx) by just (ww_ctx). Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20210316153119.13802-2-longman@redhat.com --- kernel/locking/mutex.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index adb935090768..622ebdfcd083 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { if (!waiter) { /* @@ -702,7 +702,7 @@ fail: #else static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, struct mutex_waiter *waiter) + struct mutex_waiter *waiter) { return false; } @@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_mutex *ww; int ret; + if (!use_ww_ctx) + ww_ctx = NULL; + might_sleep(); #ifdef CONFIG_DEBUG_MUTEXES @@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, #endif ww = container_of(lock, struct ww_mutex, base); - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; @@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { + mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx && ww_ctx) + if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; @@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * After waiting to acquire the wait_lock, try again. */ if (__mutex_trylock(lock)) { - if (use_ww_ctx && ww_ctx) + if (ww_ctx) __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; @@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; @@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * ww_mutex needs to always recheck its position since its waiter * list is not FIFO ordered. */ - if ((use_ww_ctx && ww_ctx) || !first) { + if (ww_ctx || !first) { first = __mutex_waiter_is_first(lock, &waiter); if (first) __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); @@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * or we must see its unlock and acquire. */ if (__mutex_trylock(lock) || - (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) + (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) break; spin_lock(&lock->wait_lock); @@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired: __set_current_state(TASK_RUNNING); - if (use_ww_ctx && ww_ctx) { + if (ww_ctx) { /* * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. @@ -1068,7 +1071,7 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx && ww_ctx) + if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); spin_unlock(&lock->wait_lock); -- cgit v1.2.3-71-gd317