From bddda606ec76550dd63592e32a6e87e7d32583f7 Mon Sep 17 00:00:00 2001 From: Srinivas Ramana Date: Thu, 20 Dec 2018 19:05:57 +0530 Subject: genirq: Make sure the initial affinity is not empty If all CPUs in the irq_default_affinity mask are offline when an interrupt is initialized then irq_setup_affinity() can set an empty affinity mask for a newly allocated interrupt. Fix this by falling back to cpu_online_mask in case the resulting affinity mask is zero. Signed-off-by: Srinivas Ramana Signed-off-by: Thomas Gleixner Cc: linux-arm-msm@vger.kernel.org Link: https://lkml.kernel.org/r/1545312957-8504-1-git-send-email-sramana@codeaurora.org --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4888ce4667a..84b54a17b95d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc) } cpumask_and(&mask, cpu_online_mask, set); + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpu_online_mask); + if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); -- cgit v1.2.3-71-gd317 From 93ad0fc088c5b4631f796c995bdd27a082ef33a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jan 2019 14:33:16 +0100 Subject: posix-cpu-timers: Unbreak timer rearming The recent commit which prevented a division by 0 issue in the alarm timer code broke posix CPU timers as an unwanted side effect. The reason is that the common rearm code checks for timer->it_interval being 0 now. What went unnoticed is that the posix cpu timer setup does not initialize timer->it_interval as it stores the interval in CPU timer specific storage. The reason for the separate storage is historical as the posix CPU timers always had a 64bit nanoseconds representation internally while timer->it_interval is type ktime_t which used to be a modified timespec representation on 32bit machines. Instead of reverting the offending commit and fixing the alarmtimer issue in the alarmtimer code, store the interval in timer->it_interval at CPU timer setup time so the common code check works. This also repairs the existing inconistency of the posix CPU timer code which kept a single shot timer armed despite of the interval being 0. The separate storage can be removed in mainline, but that needs to be a separate commit as the current one has to be backported to stable kernels. Fixes: 0e334db6bb4b ("posix-timers: Fix division by zero bug") Reported-by: H.J. Lu Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190111133500.840117406@linutronix.de --- kernel/time/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8f0644af40be..80f955210861 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -685,6 +685,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * set up the signal and overrun bookkeeping. */ timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); + timer->it_interval = ns_to_ktime(timer->it.cpu.incr); /* * This acts as a modification timestamp for the timer, -- cgit v1.2.3-71-gd317 From a811dc61559e0c8003f1086c2a4dc8e4d5ae4cb8 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sat, 12 Jan 2019 11:24:20 -0700 Subject: seccomp: fix UAF in user-trap code On the failure path, we do an fput() of the listener fd if the filter fails to install (e.g. because of a TSYNC race that's lost, or if the thread is killed, etc.). fput() doesn't actually release the fd, it just ads it to a work queue. Then the thread proceeds to free the filter, even though the listener struct file has a reference to it. To fix this, on the failure path let's set the private data to null, so we know in ->release() to ignore the filter. Reported-by: syzbot+981c26489b2d1c6316ba@syzkaller.appspotmail.com Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Tycho Andersen Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/seccomp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d7f538847b84..e815781ed751 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -976,6 +976,9 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; + if (!filter) + return 0; + mutex_lock(&filter->notify_lock); /* @@ -1300,6 +1303,7 @@ out: out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret < 0) { + listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); } else { -- cgit v1.2.3-71-gd317 From 227a76b64718888c1687cc237463aa000ae6fb2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Jan 2019 21:14:08 +0100 Subject: swiotlb: clear io_tlb_start and io_tlb_end in swiotlb_exit Otherwise is_swiotlb_buffer will return false positives when we first initialize a swiotlb buffer, but then free it because we have an IOMMU available. Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Reported-by: Sibren Vasse Signed-off-by: Christoph Hellwig Tested-by: Sibren Vasse Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d6361776dc5c..1fb6fd68b9c7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -378,6 +378,8 @@ void __init swiotlb_exit(void) memblock_free_late(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } + io_tlb_start = 0; + io_tlb_end = 0; io_tlb_nslabs = 0; max_segment = 0; } -- cgit v1.2.3-71-gd317 From b1e8818cabf407a5a2cec696411b0bdfd7fd12f0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 15 Jan 2019 17:07:47 -0800 Subject: bpf: btf: support 128 bit integer type Currently, btf only supports up to 64-bit integer. On the other hand, 128bit support for gcc and clang has existed for a long time. For example, both gcc 4.8 and llvm 3.7 supports types "__int128" and "unsigned __int128" for virtually all 64bit architectures including bpf. The requirement for __int128 support comes from two areas: . bpf program may use __int128. For example, some bcc tools (https://github.com/iovisor/bcc/tree/master/tools), mostly tcp v6 related, tcpstates.py, tcpaccept.py, etc., are using __int128 to represent the ipv6 addresses. . linux itself is using __int128 types. Hence supporting __int128 type in BTF is required for vmlinux BTF, which will be used by "compile once and run everywhere" and other projects. For 128bit integer, instead of base-10, hex numbers are pretty printed out as large decimal number is hard to decipher, e.g., for ipv6 addresses. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 104 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2f53642592b..022ef9ca1296 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -157,7 +157,7 @@ * */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -525,7 +525,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) /* * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128. */ static bool btf_type_int_is_regular(const struct btf_type *t) { @@ -538,7 +538,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t) if (BITS_PER_BYTE_MASKED(nr_bits) || BTF_INT_OFFSET(int_data) || (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && + nr_bytes != (2 * sizeof(u64)))) { return false; } @@ -1063,9 +1064,9 @@ static int btf_int_check_member(struct btf_verifier_env *env, nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1119,9 +1120,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env, bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1168,9 +1169,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); - if (nr_bits > BITS_PER_U64) { + if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", - BITS_PER_U64); + BITS_PER_U128); return -EINVAL; } @@ -1211,31 +1212,93 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int128_print(struct seq_file *m, void *data) +{ + /* data points to a __int128 number. + * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(u64 *)data; + lower_num = *(u64 *)(data + 8); +#else + upper_num = *(u64 *)(data + 8); + lower_num = *(u64 *)data; +#endif + if (upper_num == 0) + seq_printf(m, "0x%llx", lower_num); + else + seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +} + +static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, + u16 right_shift_bits) +{ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + print_num[1] = upper_num; +#endif +} + static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; - u64 print_num; + u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num = 0; - memcpy(&print_num, data, nr_copy_bytes); + memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else - left_shift_bits = BITS_PER_U64 - nr_copy_bits; + left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif - right_shift_bits = BITS_PER_U64 - nr_bits; - - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; + right_shift_bits = BITS_PER_U128 - nr_bits; - seq_printf(m, "0x%llx", print_num); + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(m, print_num); } @@ -1250,7 +1313,7 @@ static void btf_int_bits_seq_show(const struct btf *btf, /* * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. + * BTF_INT_OFFSET() cannot exceed 128 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1274,6 +1337,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, } switch (nr_bits) { + case 128: + btf_int128_print(m, data); + break; case 64: if (sign) seq_printf(m, "%lld", *(s64 *)data); -- cgit v1.2.3-71-gd317 From d0b2818efbe27e6c2e0c52621c8db18eb5abb5e1 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 16 Jan 2019 10:43:01 -0800 Subject: bpf: fix a (false) compiler warning An older GCC compiler complains: kernel/bpf/verifier.c: In function 'bpf_check': kernel/bpf/verifier.c:4***:13: error: 'prev_offset' may be used uninitialized in this function [-Werror=maybe-uninitialized] } else if (krecord[i].insn_offset <= prev_offset) { ^ kernel/bpf/verifier.c:4***:38: note: 'prev_offset' was declared here u32 i, nfuncs, urec_size, min_size, prev_offset; Although the compiler is wrong here, the patch makes sure that prev_offset is always initialized, just to silence the warning. v2: fix a spelling error in the commit message. Signed-off-by: Peter Oskolkov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56674a7c3778..ce87198ecd01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4997,13 +4997,14 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { - u32 i, nfuncs, urec_size, min_size, prev_offset; + u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; + u32 prev_offset = 0; int ret = 0; nfuncs = attr->func_info_cnt; -- cgit v1.2.3-71-gd317 From 0b698005a9d11c0e91141ec11a2c4918a129f703 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:15 -0800 Subject: bpf: don't assume build-id length is always 20 bytes Build-id length is not fixed to 20, it can be (`man ld` /--build-id): * 128-bit (uuid) * 160-bit (sha1) * any length specified in ld --build-id=0xhexstring To fix the issue of missing BPF_STACK_BUILD_ID_VALID for shorter build-ids, assume that build-id is somewhere in the range of 1 .. 20. Set the remaining bytes to zero. v2: * don't introduce new "len = min(BPF_BUILD_ID_SIZE, nhdr->n_descsz)", we already know that nhdr->n_descsz <= BPF_BUILD_ID_SIZE if we enter this 'if' condition Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d9e2483669d0..f9df545e92f6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr, if (nhdr->n_type == BPF_BUILD_ID && nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - BPF_BUILD_ID_SIZE); + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BPF_BUILD_ID_SIZE - nhdr->n_descsz); return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + -- cgit v1.2.3-71-gd317 From 4af396ae4836c4ecab61e975b8e61270c551894d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:16 -0800 Subject: bpf: zero out build_id for BPF_STACK_BUILD_ID_IP When returning BPF_STACK_BUILD_ID_IP from stack_map_get_build_id_offset, make sure that build_id field is empty. Since we are using percpu free list, there is a possibility that we might reuse some previous bpf_stack_build_id with non-zero build_id. Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f9df545e92f6..d43b14535827 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); } return; } @@ -324,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] -- cgit v1.2.3-71-gd317 From 583c53185399cea5c51195064564d1c9ddc70cf3 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 Jan 2019 20:29:40 +0100 Subject: bpf: Make function btf_name_offset_valid static Initially in commit 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") the function 'btf_name_offset_valid' was introduced as static function it was later on changed to a non-static one, and then finally in commit 23127b33ec80 ("bpf: Create a new btf_name_by_offset() for non type name use case") the function prototype was removed. Revert back to original implementation and make the function static. Remove warning triggered with W=1: kernel/bpf/btf.c:470:6: warning: no previous prototype for 'btf_name_offset_valid' [-Wmissing-prototypes] Fixes: 23127b33ec80 ("bpf: Create a new btf_name_by_offset() for non type name use case") Signed-off-by: Mathieu Malaterre Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2f53642592b..befe570be5ba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -467,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) return kind_ops[BTF_INFO_KIND(t->info)]; } -bool btf_name_offset_valid(const struct btf *btf, u32 offset) +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return BTF_STR_OFFSET_VALID(offset) && offset < btf->hdr.str_len; -- cgit v1.2.3-71-gd317 From c8dc79806e7f6cb6b0952aae1ce626c39905ad7e Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 Jan 2019 20:35:29 +0100 Subject: bpf: Annotate implicit fall through in cgroup_dev_func_proto There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warnings (W=1). This commit removes the following warning: kernel/bpf/cgroup.c:719:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9425c2fb872f..ab612fe9862f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -718,6 +718,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* fall through */ default: return NULL; } -- cgit v1.2.3-71-gd317 From 12fee4cd5be2c4a73cc13d7ad76eb2d2feda8a71 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 17 Jan 2019 11:00:09 +0800 Subject: genirq/irqdesc: Fix double increment in alloc_descs() The recent rework of alloc_descs() introduced a double increment of the loop counter. As a consequence only every second affinity mask is validated. Remove it. [ tglx: Massaged changelog ] Fixes: c410abbbacb9 ("genirq/affinity: Add is_managed to struct irq_affinity_desc") Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: Huacai Chen Cc: Dou Liyang Link: https://lkml.kernel.org/r/1547694009-16261-1-git-send-email-chenhc@lemote.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ee062b7939d3..ef8ad36cadcf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -457,7 +457,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, /* Validate affinity mask(s) */ if (affinity) { - for (i = 0; i < cnt; i++, i++) { + for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } -- cgit v1.2.3-71-gd317 From 9d5564ddcf2a0f5ba3fa1c3a1f8a1b59ad309553 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Jan 2019 16:34:45 +0100 Subject: bpf: fix inner map masking to prevent oob under speculation During review I noticed that inner meta map setup for map in map is buggy in that it does not propagate all needed data from the reference map which the verifier is later accessing. In particular one such case is index masking to prevent out of bounds access under speculative execution due to missing the map's unpriv_array/index_mask field propagation. Fix this such that the verifier is generating the correct code for inlined lookups in case of unpriviledged use. Before patch (test_verifier's 'map in map access' dump): # bpftool prog dump xla id 3 0: (62) *(u32 *)(r10 -4) = 0 1: (bf) r2 = r10 2: (07) r2 += -4 3: (18) r1 = map[id:4] 5: (07) r1 += 272 | 6: (61) r0 = *(u32 *)(r2 +0) | 7: (35) if r0 >= 0x1 goto pc+6 | Inlined map in map lookup 8: (54) (u32) r0 &= (u32) 0 | with index masking for 9: (67) r0 <<= 3 | map->unpriv_array. 10: (0f) r0 += r1 | 11: (79) r0 = *(u64 *)(r0 +0) | 12: (15) if r0 == 0x0 goto pc+1 | 13: (05) goto pc+1 | 14: (b7) r0 = 0 | 15: (15) if r0 == 0x0 goto pc+11 16: (62) *(u32 *)(r10 -4) = 0 17: (bf) r2 = r10 18: (07) r2 += -4 19: (bf) r1 = r0 20: (07) r1 += 272 | 21: (61) r0 = *(u32 *)(r2 +0) | Index masking missing (!) 22: (35) if r0 >= 0x1 goto pc+3 | for inner map despite 23: (67) r0 <<= 3 | map->unpriv_array set. 24: (0f) r0 += r1 | 25: (05) goto pc+1 | 26: (b7) r0 = 0 | 27: (b7) r0 = 0 28: (95) exit After patch: # bpftool prog dump xla id 1 0: (62) *(u32 *)(r10 -4) = 0 1: (bf) r2 = r10 2: (07) r2 += -4 3: (18) r1 = map[id:2] 5: (07) r1 += 272 | 6: (61) r0 = *(u32 *)(r2 +0) | 7: (35) if r0 >= 0x1 goto pc+6 | Same inlined map in map lookup 8: (54) (u32) r0 &= (u32) 0 | with index masking due to 9: (67) r0 <<= 3 | map->unpriv_array. 10: (0f) r0 += r1 | 11: (79) r0 = *(u64 *)(r0 +0) | 12: (15) if r0 == 0x0 goto pc+1 | 13: (05) goto pc+1 | 14: (b7) r0 = 0 | 15: (15) if r0 == 0x0 goto pc+12 16: (62) *(u32 *)(r10 -4) = 0 17: (bf) r2 = r10 18: (07) r2 += -4 19: (bf) r1 = r0 20: (07) r1 += 272 | 21: (61) r0 = *(u32 *)(r2 +0) | 22: (35) if r0 >= 0x1 goto pc+4 | Now fixed inlined inner map 23: (54) (u32) r0 &= (u32) 0 | lookup with proper index masking 24: (67) r0 <<= 3 | for map->unpriv_array. 25: (0f) r0 += r1 | 26: (05) goto pc+1 | 27: (b7) r0 = 0 | 28: (b7) r0 = 0 29: (95) exit Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 99d243e1ad6e..52378d3e34b3 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -36,7 +37,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -46,9 +52,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } + fdput(f); return inner_map_meta; } -- cgit v1.2.3-71-gd317 From 6dc080eeb2ba01973bfff0d79844d7a59e12542e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 30 Nov 2018 20:40:56 +0530 Subject: sched/wait: Fix rcuwait_wake_up() ordering For some peculiar reason rcuwait_wake_up() has the right barrier in the comment, but not in the code. This mistake has been observed to cause a deadlock in the following situation: P1 P2 percpu_up_read() percpu_down_write() rcu_sync_is_idle() // false rcu_sync_enter() ... __percpu_up_read() [S] ,- __this_cpu_dec(*sem->read_count) | smp_rmb(); [L] | task = rcu_dereference(w->task) // NULL | | [S] w->task = current | smp_mb(); | [L] readers_active_check() // fail `-> Where the smp_rmb() (obviously) fails to constrain the store. [ peterz: Added changelog. ] Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrea Parri Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Link: https://lkml.kernel.org/r/1543590656-7157-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 284f2fe9a293..3fb7be001964 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w) * MB (A) MB (B) * [L] cond [L] tsk */ - smp_rmb(); /* (B) */ + smp_mb(); /* (B) */ /* * Avoid using task_rcu_dereference() magic as long as we are careful, -- cgit v1.2.3-71-gd317 From e6018c0f5c996e61639adce6a0697391a2861916 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Dec 2018 10:14:53 +0100 Subject: sched/wake_q: Document wake_q_add() The only guarantee provided by wake_q_add() is that a wakeup will happen after it, it does _NOT_ guarantee the wakeup will be delayed until the matching wake_up_q(). If wake_q_add() fails the cmpxchg() a concurrent wakeup is pending and that can happen at any time after the cmpxchg(). This means we should not rely on the wakeup happening at wake_q_up(), but should be ready for wake_q_add() to issue the wakeup. The delay; if provided (most likely); should only result in more efficient behaviour. Reported-by: Yongji Xie Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 6 +++++- kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 10b19a192b2d..545f37138057 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -24,9 +24,13 @@ * called near the end of a function. Otherwise, the list can be * re-initialized for later re-use by wake_q_init(). * - * Note that this can cause spurious wakeups. schedule() callers + * NOTE that this can cause spurious wakeups. schedule() callers * must ensure the call is done inside a loop, confirming that the * wakeup condition has in fact occurred. + * + * NOTE that there is no guarantee the wakeup will happen any later than the + * wake_q_add() location. Therefore task must be ready to be woken at the + * location of the wake_q_add(). */ #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a674c7db2f29..cc814933f7d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -396,6 +396,18 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ void wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; -- cgit v1.2.3-71-gd317 From 4c4e3731564c8945ac5ac90fc2a1e1f21cb79c92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Dec 2018 10:14:53 +0100 Subject: sched/wake_q: Fix wakeup ordering for wake_q Notable cmpxchg() does not provide ordering when it fails, however wake_q_add() requires ordering in this specific case too. Without this it would be possible for the concurrent wakeup to not observe our prior state. Andrea Parri provided: C wake_up_q-wake_q_add { int next = 0; int y = 0; } P0(int *next, int *y) { int r0; /* in wake_up_q() */ WRITE_ONCE(*next, 1); /* node->next = NULL */ smp_mb(); /* implied by wake_up_process() */ r0 = READ_ONCE(*y); } P1(int *next, int *y) { int r1; /* in wake_q_add() */ WRITE_ONCE(*y, 1); /* wake_cond = true */ smp_mb__before_atomic(); r1 = cmpxchg_relaxed(next, 1, 2); } exists (0:r0=0 /\ 1:r1=0) This "exists" clause cannot be satisfied according to the LKMM: Test wake_up_q-wake_q_add Allowed States 3 0:r0=0; 1:r1=1; 0:r0=1; 1:r1=0; 0:r0=1; 1:r1=1; No Witnesses Positive: 0 Negative: 3 Condition exists (0:r0=0 /\ 1:r1=0) Observation wake_up_q-wake_q_add Never 0 3 Reported-by: Yongji Xie Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc814933f7d6..d8d76a65cfdd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -417,10 +417,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() executes a full barrier, which pairs with the full - * barrier executed by the wakeup in wake_up_q(). + * In order to ensure that a pending wakeup will observe our pending + * state, even in the failed case, an explicit smp_mb() must be used. */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) return; get_task_struct(task); -- cgit v1.2.3-71-gd317 From b061c38bef43406df8e73c5be06cbfacad5ee6ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Nov 2018 14:44:49 +0100 Subject: futex: Fix (possible) missed wakeup We must not rely on wake_q_add() to delay the wakeup; in particular commit: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups") moved wake_q_add() before smp_store_release(&q->lock_ptr, NULL), which could result in futex_wait() waking before observing ->lock_ptr == NULL and going back to sleep again. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups") Signed-off-by: Ingo Molnar --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index be3bff2315ff..fdd312da0992 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1452,11 +1452,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return; - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. - */ - wake_q_add(wake_q, p); + get_task_struct(p); __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL @@ -1466,6 +1462,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * plist_del in __unqueue_futex(). */ smp_store_release(&q->lock_ptr, NULL); + + /* + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. + */ + wake_q_add(wake_q, p); + put_task_struct(p); } /* -- cgit v1.2.3-71-gd317 From e158488be27b157802753a59b336142dc0eb0380 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 29 Nov 2018 20:50:30 +0800 Subject: locking/rwsem: Fix (possible) missed wakeup Because wake_q_add() can imply an immediate wakeup (cmpxchg failure case), we must not rely on the wakeup being delayed. However, commit: e38513905eea ("locking/rwsem: Rework zeroing reader waiter->task") relies on exactly that behaviour in that the wakeup must not happen until after we clear waiter->task. [ peterz: Added changelog. ] Signed-off-by: Xie Yongji Signed-off-by: Zhang Yu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e38513905eea ("locking/rwsem: Rework zeroing reader waiter->task") Link: https://lkml.kernel.org/r/1543495830-2644-1-git-send-email-xieyongji@baidu.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09b180063ee1..50d9af615dc4 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, woken++; tsk = waiter->task; - wake_q_add(wake_q, tsk); + get_task_struct(tsk); list_del(&waiter->list); /* - * Ensure that the last operation is setting the reader + * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add(wake_q, tsk); + /* wake_q_add() already take the task ref */ + put_task_struct(tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; -- cgit v1.2.3-71-gd317 From 2cbd95a5c4fb855a4177c0343a880cc2091f500d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:18 -0800 Subject: bpf: change parameters of call/branch offset adjustment In preparation for code removal change parameters to branch and call adjustment functions to be more universal. The current parameters assume we are patching a single instruction with a longer set. A diagram may help reading the change, this is for the patch single case, patching instruction 1 with a replacement of 4: ____ 0 |____| 1 |____| <-- pos ^ 2 | | <-- end old ^ | 3 | | | delta | len 4 |____| | | (patch region) 5 | | <-- end new v v 6 |____| end_old = pos + 1 end_new = pos + delta + 1 If we are before the patch region - curr variable and the target are fully in old coordinates (hence comparing against end_old). If we are after the region curr is in new coordinates (hence the comparison to end_new) but target is in mixed coordinates, so we just check if it falls before end_new, and if so it needs the adjustment. Note that we will not fix up branches which land in removed region in case of removal, which should be okay, as we are only going to remove dead code. Signed-off-by: Jakub Kicinski Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..ad08ba341197 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -307,15 +307,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s32 delta = end_new - end_old; s64 imm = insn->imm; - if (curr < pos && curr + imm + 1 > pos) + if (curr < pos && curr + imm + 1 >= end_old) imm += delta; - else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; @@ -324,15 +325,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 delta = end_new - end_old; s32 off = insn->off; - if (curr < pos && curr + off + 1 > pos) + if (curr < pos && curr + off + 1 >= end_old) off += delta; - else if (curr > pos + delta && curr + off + 1 <= pos + delta) + else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; @@ -341,10 +343,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, - const bool probe_pass) +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, + s32 end_new, const bool probe_pass) { - u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); + u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; @@ -356,8 +358,8 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { - i += delta + 1; - insn++; + i = end_new; + insn = prog->insnsi + end_old; } code = insn->code; if (BPF_CLASS(code) != BPF_JMP || @@ -367,11 +369,11 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; - ret = bpf_adj_delta_to_imm(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_imm(insn, pos, end_old, + end_new, i, probe_pass); } else { - ret = bpf_adj_delta_to_off(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_off(insn, pos, end_old, + end_new, i, probe_pass); } if (ret) break; @@ -421,7 +423,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, insn_delta, true)) + bpf_adj_branches(prog, off, off + 1, off + len, true)) return NULL; /* Several new instructions need to be inserted. Make room @@ -453,7 +455,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. */ - BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); -- cgit v1.2.3-71-gd317 From e2ae4ca266a1c9a0163738129506dbc63d5cca80 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:19 -0800 Subject: bpf: verifier: hard wire branches to dead code Loading programs with dead code becomes more and more common, as people begin to patch constants at load time. Turn conditional jumps to unconditional ones, to avoid potential branch misprediction penalty. This optimization is enabled for privileged users only. For branches which just fall through we could just mark them as not seen and have dead code removal take care of them, but that seems less clean. v0.2: - don't call capable(CAP_SYS_ADMIN) twice (Jiong). v3: - fix GCC warning; Signed-off-by: Jakub Kicinski Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ce87198ecd01..bf1f98e8beb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6458,6 +6458,40 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } +static bool insn_is_cond_jump(u8 code) +{ + u8 op; + + if (BPF_CLASS(code) != BPF_JMP) + return false; + + op = BPF_OP(code); + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!insn_is_cond_jump(insn->code)) + continue; + + if (!aux_data[i + 1].seen) + ja.off = insn->off; + else if (!aux_data[i + 1 + insn->off].seen) + ja.off = 0; + else + continue; + + memcpy(insn, &ja, sizeof(ja)); + } +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -7149,6 +7183,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, struct bpf_verifier_env *env; struct bpf_verifier_log *log; int ret = -EINVAL; + bool is_priv; /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) @@ -7195,6 +7230,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; + is_priv = capable(CAP_SYS_ADMIN); + env->allow_ptr_leaks = is_priv; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -7212,8 +7250,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -7243,6 +7279,11 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (is_priv) { + if (ret == 0) + opt_hard_wire_dead_code_branches(env); + } + if (ret == 0) sanitize_dead_code(env); -- cgit v1.2.3-71-gd317 From 52875a04f4b26e7ef30a288ea096f7cfec0e93cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:20 -0800 Subject: bpf: verifier: remove dead code Instead of overwriting dead code with jmp -1 instructions remove it completely for root. Adjust verifier state and line info appropriately. v2: - adjust func_info (Alexei); - make sure first instruction retains line info (Alexei). v4: (Yonghong) - remove unnecessary if (!insn to remove) checks; - always keep last line info if first live instruction lacks one. v5: (Martin Lau) - improve and clarify comments. Signed-off-by: Jakub Kicinski Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + kernel/bpf/core.c | 12 ++++ kernel/bpf/verifier.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 186 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index ad106d845b22..be9af6b4a9e4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -778,6 +778,7 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); void bpf_clear_redirect_map(struct bpf_map *map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad08ba341197..2a81b8af3748 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -462,6 +462,18 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, return prog_adj; } +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +{ + /* Branch offsets can't overflow when program is shrinking, no need + * to call bpf_adj_branches(..., true) here + */ + memmove(prog->insnsi + off, prog->insnsi + off + cnt, + sizeof(struct bpf_insn) * (prog->len - off - cnt)); + prog->len -= cnt; + + return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +} + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf1f98e8beb6..099b2541f87f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6432,6 +6432,150 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, + u32 off, u32 cnt) +{ + int i, j; + + /* find first prog starting at or after off (first to remove) */ + for (i = 0; i < env->subprog_cnt; i++) + if (env->subprog_info[i].start >= off) + break; + /* find first prog starting at or after off + cnt (first to stay) */ + for (j = i; j < env->subprog_cnt; j++) + if (env->subprog_info[j].start >= off + cnt) + break; + /* if j doesn't start exactly at off + cnt, we are just removing + * the front of previous prog + */ + if (env->subprog_info[j].start != off + cnt) + j--; + + if (j > i) { + struct bpf_prog_aux *aux = env->prog->aux; + int move; + + /* move fake 'exit' subprog as well */ + move = env->subprog_cnt + 1 - j; + + memmove(env->subprog_info + i, + env->subprog_info + j, + sizeof(*env->subprog_info) * move); + env->subprog_cnt -= j - i; + + /* remove func_info */ + if (aux->func_info) { + move = aux->func_info_cnt - j; + + memmove(aux->func_info + i, + aux->func_info + j, + sizeof(*aux->func_info) * move); + aux->func_info_cnt -= j - i; + /* func_info->insn_off is set after all code rewrites, + * in adjust_btf_func() - no need to adjust + */ + } + } else { + /* convert i from "first prog to remove" to "first to adjust" */ + if (env->subprog_info[i].start == off) + i++; + } + + /* update fake 'exit' subprog as well */ + for (; i <= env->subprog_cnt; i++) + env->subprog_info[i].start -= cnt; + + return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, + u32 cnt) +{ + struct bpf_prog *prog = env->prog; + u32 i, l_off, l_cnt, nr_linfo; + struct bpf_line_info *linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo) + return 0; + + linfo = prog->aux->linfo; + + /* find first line info to remove, count lines to be removed */ + for (i = 0; i < nr_linfo; i++) + if (linfo[i].insn_off >= off) + break; + + l_off = i; + l_cnt = 0; + for (; i < nr_linfo; i++) + if (linfo[i].insn_off < off + cnt) + l_cnt++; + else + break; + + /* First live insn doesn't match first live linfo, it needs to "inherit" + * last removed linfo. prog is already modified, so prog->len == off + * means no live instructions after (tail of the program was removed). + */ + if (prog->len != off && l_cnt && + (i == nr_linfo || linfo[i].insn_off != off + cnt)) { + l_cnt--; + linfo[--i].insn_off = off + cnt; + } + + /* remove the line info which refer to the removed instructions */ + if (l_cnt) { + memmove(linfo + l_off, linfo + i, + sizeof(*linfo) * (nr_linfo - i)); + + prog->aux->nr_linfo -= l_cnt; + nr_linfo = prog->aux->nr_linfo; + } + + /* pull all linfo[i].insn_off >= off + cnt in by cnt */ + for (i = l_off; i < nr_linfo; i++) + linfo[i].insn_off -= cnt; + + /* fix up all subprogs (incl. 'exit') which start >= off */ + for (i = 0; i <= env->subprog_cnt; i++) + if (env->subprog_info[i].linfo_idx > l_off) { + /* program may have started in the removed region but + * may not be fully removed + */ + if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) + env->subprog_info[i].linfo_idx -= l_cnt; + else + env->subprog_info[i].linfo_idx = l_off; + } + + return 0; +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + unsigned int orig_prog_len = env->prog->len; + int err; + + err = bpf_remove_insns(env->prog, off, cnt); + if (err) + return err; + + err = adjust_subprog_starts_after_remove(env, off, cnt); + if (err) + return err; + + err = bpf_adj_linfo_after_remove(env, off, cnt); + if (err) + return err; + + memmove(aux_data + off, aux_data + off + cnt, + sizeof(*aux_data) * (orig_prog_len - off - cnt)); + + return 0; +} + /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. Malicious programs can * have dead code too. Therefore replace all dead at-run-time code @@ -6492,6 +6636,30 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) } } +static int opt_remove_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + int j; + + j = 0; + while (i + j < insn_cnt && !aux_data[i + j].seen) + j++; + if (!j) + continue; + + err = verifier_remove_insns(env, i, j); + if (err) + return err; + insn_cnt = env->prog->len; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -7282,11 +7450,13 @@ skip_full_check: if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); + if (ret == 0) + ret = opt_remove_dead_code(env); + } else { + if (ret == 0) + sanitize_dead_code(env); } - if (ret == 0) - sanitize_dead_code(env); - if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3-71-gd317 From a1b14abc009d9c13be355dbd4a4c4d47816ad3db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:21 -0800 Subject: bpf: verifier: remove unconditional branches by 0 Unconditional branches by 0 instructions are basically noops but they can result from earlier optimizations, e.g. a conditional jumps which would never be taken or a conditional jump around dead code. Remove those branches. v0.2: - s/opt_remove_dead_branches/opt_remove_nops/ (Jiong). Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 099b2541f87f..f39bca188a5c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6660,6 +6660,27 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env) return 0; } +static int opt_remove_nops(struct bpf_verifier_env *env) +{ + const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + if (memcmp(&insn[i], &ja, sizeof(ja))) + continue; + + err = verifier_remove_insns(env, i, 1); + if (err) + return err; + insn_cnt--; + i--; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -7452,6 +7473,8 @@ skip_full_check: opt_hard_wire_dead_code_branches(env); if (ret == 0) ret = opt_remove_dead_code(env); + if (ret == 0) + ret = opt_remove_nops(env); } else { if (ret == 0) sanitize_dead_code(env); -- cgit v1.2.3-71-gd317 From 9e4c24e7ee7dfd3898269519103e823892b730d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:23 -0800 Subject: bpf: verifier: record original instruction index The communication between the verifier and advanced JITs is based on instruction indexes. We have to keep them stable throughout the optimizations otherwise referring to a particular instruction gets messy quickly. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 573cca00a0e6..f3ae00ee5516 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -187,6 +187,7 @@ struct bpf_insn_aux_data { int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ u8 alu_state; /* used in combination with alu_limit */ + unsigned int orig_idx; /* original instruction index */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f39bca188a5c..f2c49b4235df 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7371,7 +7371,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, { struct bpf_verifier_env *env; struct bpf_verifier_log *log; - int ret = -EINVAL; + int i, len, ret = -EINVAL; bool is_priv; /* no program is valid */ @@ -7386,12 +7386,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, return -ENOMEM; log = &env->log; + len = (*prog)->len; env->insn_aux_data = - vzalloc(array_size(sizeof(struct bpf_insn_aux_data), - (*prog)->len)); + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; + for (i = 0; i < len; i++) + env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; -- cgit v1.2.3-71-gd317 From 08ca90afba255d05dc3253caa44056e7aecbe8c5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 22:45:24 -0800 Subject: bpf: notify offload JITs about optimizations Let offload JITs know when instructions are replaced and optimized out, so they can update their state appropriately. The optimizations are best effort, if JIT returns an error from any callback verifier will stop notifying it as state may now be out of sync, but the verifier continues making progress. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/offload.c | 35 +++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 ++++++ 4 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e734f163bd0b..3851529062ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,9 +268,15 @@ struct bpf_verifier_ops { }; struct bpf_prog_offload_ops { + /* verifier basic callbacks */ int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); + /* verifier optimization callbacks (called after .finalize) */ + int (*replace_insn)(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn); + int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); + /* program management callbacks */ int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); @@ -283,6 +289,7 @@ struct bpf_prog_offload { void *dev_priv; struct list_head offloads; bool dev_state; + bool opt_failed; void *jited_image; u32 jited_len; }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f3ae00ee5516..0620e418dde5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -266,5 +266,10 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn); +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 54cf2b9c44a4..39dba8c90331 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -173,6 +173,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) return ret; } +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn) +{ + const struct bpf_prog_offload_ops *ops; + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + ops = offload->offdev->ops; + if (!offload->opt_failed && ops->replace_insn) + ret = ops->replace_insn(env, off, insn); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (!offload->opt_failed && offload->offdev->ops->remove_insns) + ret = offload->offdev->ops->remove_insns(env, off, cnt); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f2c49b4235df..8cfe39ef770f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6558,6 +6558,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) unsigned int orig_prog_len = env->prog->len; int err; + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_remove_insns(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -6632,6 +6635,9 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) else continue; + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_replace_insn(env, i, &ja); + memcpy(insn, &ja, sizeof(ja)); } } -- cgit v1.2.3-71-gd317 From a72dafafbd5f11c6ea3a9682d64da1074f28eb67 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:00 -0500 Subject: bpf: refactor verifier min/max code for condition jump The current min/max code does both signed and unsigned comparisons against the input argument "val" which is "u64" and there is explicit type casting when the comparison is signed. As we will need slightly more complexer type casting when JMP32 introduced, it is better to host the signed type casting. This makes the code more clean with ignorable runtime overhead. Also, code for J*GE/GT/LT/LE and JEQ/JNE are very similar, this patch combine them. The main purpose for this refactor is to make sure the min/max code will still be readable and with minimum code duplication after JMP32 introduced. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 172 +++++++++++++++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8cfe39ef770f..eae6cb1fe653 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4033,9 +4033,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, */ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) { + s64 sval; + if (__is_pointer_value(false, reg)) return -1; + sval = (s64)val; + switch (opcode) { case BPF_JEQ: if (tnum_is_const(reg->var_off)) @@ -4058,9 +4062,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGT: - if (reg->smin_value > (s64)val) + if (reg->smin_value > sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLT: @@ -4070,9 +4074,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLT: - if (reg->smax_value < (s64)val) + if (reg->smax_value < sval) return 1; - else if (reg->smin_value >= (s64)val) + else if (reg->smin_value >= sval) return 0; break; case BPF_JGE: @@ -4082,9 +4086,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGE: - if (reg->smin_value >= (s64)val) + if (reg->smin_value >= sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLE: @@ -4094,9 +4098,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLE: - if (reg->smax_value <= (s64)val) + if (reg->smax_value <= sval) return 1; - else if (reg->smin_value > (s64)val) + else if (reg->smin_value > sval) return 0; break; } @@ -4113,6 +4117,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + s64 sval; + /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -4122,19 +4128,22 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; + sval = (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. - */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + /* For BPF_JEQ, if this is false we know nothing Jon Snow, but + * if it is true we know the value for sure. Likewise for + * BPF_JNE. */ - __mark_reg_known(false_reg, val); + __mark_reg_known(reg, val); break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4142,38 +4151,46 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - false_reg->umax_value = min(false_reg->umax_value, val); - true_reg->umin_value = max(true_reg->umin_value, val + 1); - break; - case BPF_JSGT: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - break; - case BPF_JLT: - false_reg->umin_value = max(false_reg->umin_value, val); - true_reg->umax_value = min(true_reg->umax_value, val - 1); - break; - case BPF_JSLT: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - break; case BPF_JGE: - false_reg->umax_value = min(false_reg->umax_value, val - 1); - true_reg->umin_value = max(true_reg->umin_value, val); + case BPF_JGT: + { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSGE: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); + case BPF_JSGT: + { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } case BPF_JLE: - false_reg->umin_value = max(false_reg->umin_value, val + 1); - true_reg->umax_value = min(true_reg->umax_value, val); + case BPF_JLT: + { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSLE: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + case BPF_JSLT: + { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } default: break; } @@ -4198,22 +4215,23 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + s64 sval; + if (__is_pointer_value(false, false_reg)) return; + sval = (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. - */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; - */ - __mark_reg_known(false_reg, val); + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + __mark_reg_known(reg, val); break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4221,38 +4239,46 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - true_reg->umax_value = min(true_reg->umax_value, val - 1); - false_reg->umin_value = max(false_reg->umin_value, val); - break; - case BPF_JSGT: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - break; - case BPF_JLT: - true_reg->umin_value = max(true_reg->umin_value, val + 1); - false_reg->umax_value = min(false_reg->umax_value, val); - break; - case BPF_JSLT: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - break; case BPF_JGE: - true_reg->umax_value = min(true_reg->umax_value, val); - false_reg->umin_value = max(false_reg->umin_value, val + 1); + case BPF_JGT: + { + u64 false_umin = opcode == BPF_JGT ? val : val + 1; + u64 true_umax = opcode == BPF_JGT ? val - 1 : val; + + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSGE: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); + case BPF_JSGT: + { + s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } case BPF_JLE: - true_reg->umin_value = max(true_reg->umin_value, val); - false_reg->umax_value = min(false_reg->umax_value, val - 1); + case BPF_JLT: + { + u64 false_umax = opcode == BPF_JLT ? val : val - 1; + u64 true_umin = opcode == BPF_JLT ? val + 1 : val; + + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSLE: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + case BPF_JSLT: + { + s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; + + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } default: break; } -- cgit v1.2.3-71-gd317 From 092ed0968bb648cd18e8a0430cd0a8a71727315c Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:01 -0500 Subject: bpf: verifier support JMP32 This patch teach verifier about the new BPF_JMP32 instruction class. Verifier need to treat it similar as the existing BPF_JMP class. A BPF_JMP32 insn needs to go through all checks that have been done on BPF_JMP. Also, verifier is doing runtime optimizations based on the extra info conditional jump instruction could offer, especially when the comparison is between constant and register that the value range of the register could be improved based on the comparison results. These code are updated accordingly. Acked-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 3 +- kernel/bpf/verifier.c | 203 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 173 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2a81b8af3748..1e443ba97310 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -362,7 +362,8 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, insn = prog->insnsi + end_old; } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP || + if ((BPF_CLASS(code) != BPF_JMP && + BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eae6cb1fe653..8c1c21cd50b4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1095,7 +1095,7 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; - if (BPF_CLASS(code) != BPF_JMP) + if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; @@ -4031,14 +4031,49 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * 0 - branch will not be taken and fall-through to next insn * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { + struct bpf_reg_state reg_lo; s64 sval; if (__is_pointer_value(false, reg)) return -1; - sval = (s64)val; + if (is_jmp32) { + reg_lo = *reg; + reg = ®_lo; + /* For JMP32, only low 32 bits are compared, coerce_reg_to_size + * could truncate high bits and update umin/umax according to + * information of low bits. + */ + coerce_reg_to_size(reg, 4); + /* smin/smax need special handling. For example, after coerce, + * if smin_value is 0x00000000ffffffffLL, the value is -1 when + * used as operand to JMP32. It is a negative number from s32's + * point of view, while it is a positive number when seen as + * s64. The smin/smax are kept as s64, therefore, when used with + * JMP32, they need to be transformed into s32, then sign + * extended back to s64. + * + * Also, smin/smax were copied from umin/umax. If umin/umax has + * different sign bit, then min/max relationship doesn't + * maintain after casting into s32, for this case, set smin/smax + * to safest range. + */ + if ((reg->umax_value ^ reg->umin_value) & + (1ULL << 31)) { + reg->smin_value = S32_MIN; + reg->smax_value = S32_MAX; + } + reg->smin_value = (s64)(s32)reg->smin_value; + reg->smax_value = (s64)(s32)reg->smax_value; + + val = (u32)val; + sval = (s64)(s32)val; + } else { + sval = (s64)val; + } switch (opcode) { case BPF_JEQ: @@ -4108,6 +4143,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return -1; } +/* Generate min value of the high 32-bit from TNUM info. */ +static u64 gen_hi_min(struct tnum var) +{ + return var.value & ~0xffffffffULL; +} + +/* Generate max value of the high 32-bit from TNUM info. */ +static u64 gen_hi_max(struct tnum var) +{ + return (var.value | var.mask) & ~0xffffffffULL; +} + +/* Return true if VAL is compared with a s64 sign extended from s32, and they + * are with the same signedness. + */ +static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +{ + return ((s32)sval >= 0 && + reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || + ((s32)sval < 0 && + reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4115,7 +4173,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { s64 sval; @@ -4128,7 +4186,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; - sval = (s64)val; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; switch (opcode) { case BPF_JEQ: @@ -4141,7 +4200,15 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, * if it is true we know the value for sure. Likewise for * BPF_JNE. */ - __mark_reg_known(reg, val); + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; } case BPF_JSET: @@ -4157,6 +4224,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u64 false_umax = opcode == BPF_JGT ? val : val - 1; u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } false_reg->umax_value = min(false_reg->umax_value, false_umax); true_reg->umin_value = max(true_reg->umin_value, true_umin); break; @@ -4167,6 +4238,11 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + /* If the full s64 was not sign-extended from s32 then don't + * deduct further info. + */ + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; false_reg->smax_value = min(false_reg->smax_value, false_smax); true_reg->smin_value = max(true_reg->smin_value, true_smin); break; @@ -4177,6 +4253,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u64 false_umin = opcode == BPF_JLT ? val : val + 1; u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } false_reg->umin_value = max(false_reg->umin_value, false_umin); true_reg->umax_value = min(true_reg->umax_value, true_umax); break; @@ -4187,6 +4267,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; false_reg->smin_value = max(false_reg->smin_value, false_smin); true_reg->smax_value = min(true_reg->smax_value, true_smax); break; @@ -4213,14 +4295,15 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { s64 sval; if (__is_pointer_value(false, false_reg)) return; - sval = (s64)val; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; switch (opcode) { case BPF_JEQ: @@ -4229,7 +4312,15 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg; - __mark_reg_known(reg, val); + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; } case BPF_JSET: @@ -4245,6 +4336,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 false_umin = opcode == BPF_JGT ? val : val + 1; u64 true_umax = opcode == BPF_JGT ? val - 1 : val; + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } false_reg->umin_value = max(false_reg->umin_value, false_umin); true_reg->umax_value = min(true_reg->umax_value, true_umax); break; @@ -4255,6 +4350,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; false_reg->smin_value = max(false_reg->smin_value, false_smin); true_reg->smax_value = min(true_reg->smax_value, true_smax); break; @@ -4265,6 +4362,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 false_umax = opcode == BPF_JLT ? val : val - 1; u64 true_umin = opcode == BPF_JLT ? val + 1 : val; + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } false_reg->umax_value = min(false_reg->umax_value, false_umax); true_reg->umin_value = max(true_reg->umin_value, true_umin); break; @@ -4275,6 +4376,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; false_reg->smax_value = min(false_reg->smax_value, false_smax); true_reg->smin_value = max(true_reg->smin_value, true_smin); break; @@ -4416,6 +4519,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, if (BPF_SRC(insn->code) != BPF_X) return false; + /* Pointers are always 64-bit. */ + if (BPF_CLASS(insn->code) == BPF_JMP32) + return false; + switch (BPF_OP(insn->code)) { case BPF_JGT: if ((dst_reg->type == PTR_TO_PACKET && @@ -4508,16 +4615,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); + bool is_jmp32; int err; - if (opcode > BPF_JSLE) { - verbose(env, "invalid BPF_JMP opcode %x\n", opcode); + /* Only conditional jumps are expected to reach here. */ + if (opcode == BPF_JA || opcode > BPF_JSLE) { + verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } @@ -4533,7 +4642,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else { if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } } @@ -4544,9 +4653,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; dst_reg = ®s[insn->dst_reg]; + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { - int pred = is_branch_taken(dst_reg, insn->imm, opcode); + int pred = is_branch_taken(dst_reg, insn->imm, opcode, + is_jmp32); if (pred == 1) { /* only follow the goto, ignore fall-through */ @@ -4574,30 +4685,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state lo_reg0 = *dst_reg; + struct bpf_reg_state lo_reg1 = *src_reg; + struct bpf_reg_state *src_lo, *dst_lo; + + dst_lo = &lo_reg0; + src_lo = &lo_reg1; + coerce_reg_to_size(dst_lo, 4); + coerce_reg_to_size(src_lo, 4); + if (dst_reg->type == SCALAR_VALUE && - regs[insn->src_reg].type == SCALAR_VALUE) { - if (tnum_is_const(regs[insn->src_reg].var_off)) + src_reg->type == SCALAR_VALUE) { + if (tnum_is_const(src_reg->var_off) || + (is_jmp32 && tnum_is_const(src_lo->var_off))) reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].var_off.value, - opcode); - else if (tnum_is_const(dst_reg->var_off)) + dst_reg, + is_jmp32 + ? src_lo->var_off.value + : src_reg->var_off.value, + opcode, is_jmp32); + else if (tnum_is_const(dst_reg->var_off) || + (is_jmp32 && tnum_is_const(dst_lo->var_off))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - ®s[insn->src_reg], - dst_reg->var_off.value, opcode); - else if (opcode == BPF_JEQ || opcode == BPF_JNE) + src_reg, + is_jmp32 + ? dst_lo->var_off.value + : dst_reg->var_off.value, + opcode, is_jmp32); + else if (!is_jmp32 && + (opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], - ®s[insn->src_reg], - ®s[insn->dst_reg], opcode); + src_reg, dst_reg, opcode); } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode); + dst_reg, insn->imm, opcode, is_jmp32); } - /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ - if (BPF_SRC(insn->code) == BPF_K && + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * NOTE: these optimizations below are related with pointer comparison + * which will never be JMP32. + */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && reg_type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either @@ -4926,7 +5058,8 @@ peek_stack: goto check_state; t = insn_stack[cur_stack - 1]; - if (BPF_CLASS(insns[t].code) == BPF_JMP) { + if (BPF_CLASS(insns[t].code) == BPF_JMP || + BPF_CLASS(insns[t].code) == BPF_JMP32) { u8 opcode = BPF_OP(insns[t].code); if (opcode == BPF_EXIT) { @@ -6082,7 +6215,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -6090,7 +6223,8 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL) || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -6106,7 +6240,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -6118,7 +6253,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -6635,6 +6771,9 @@ static bool insn_is_cond_jump(u8 code) { u8 op; + if (BPF_CLASS(code) == BPF_JMP32) + return true; + if (BPF_CLASS(code) != BPF_JMP) return false; -- cgit v1.2.3-71-gd317 From 56cbd82ef0b3dc47a16beeebc8d9a9a9269093dc Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:02 -0500 Subject: bpf: disassembler support JMP32 This patch teaches disassembler about JMP32. There are two places to update: - Class 0x6 now used by BPF_JMP32, not "unused". - BPF_JMP32 need to show comparison operands properly. The disassemble format is to add an extra "(32)" before the operands if it is a sub-register. A better disassemble format for both JMP32 and ALU32 just show the register prefix as "w" instead of "r", this is the format using by LLVM assembler. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/disasm.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..de73f55e42fd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = { [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", + [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; @@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", + class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", insn->imm); } } else if (class == BPF_STX) { @@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } -- cgit v1.2.3-71-gd317 From 503a8865a47752d0ac2ff642f07e96e8b2103178 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:04 -0500 Subject: bpf: interpreter support for JMP32 This patch implements interpreting new JMP32 instructions. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 197 +++++++++++++++++------------------------------------- 1 file changed, 63 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e443ba97310..bba11c2565ee 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1145,6 +1145,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ + /* 32-bit Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP32, JEQ, X), \ + INSN_3(JMP32, JNE, X), \ + INSN_3(JMP32, JGT, X), \ + INSN_3(JMP32, JLT, X), \ + INSN_3(JMP32, JGE, X), \ + INSN_3(JMP32, JLE, X), \ + INSN_3(JMP32, JSGT, X), \ + INSN_3(JMP32, JSLT, X), \ + INSN_3(JMP32, JSGE, X), \ + INSN_3(JMP32, JSLE, X), \ + INSN_3(JMP32, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP32, JEQ, K), \ + INSN_3(JMP32, JNE, K), \ + INSN_3(JMP32, JGT, K), \ + INSN_3(JMP32, JLT, K), \ + INSN_3(JMP32, JGE, K), \ + INSN_3(JMP32, JLE, K), \ + INSN_3(JMP32, JSGT, K), \ + INSN_3(JMP32, JSLT, K), \ + INSN_3(JMP32, JSGE, K), \ + INSN_3(JMP32, JSLE, K), \ + INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ @@ -1405,145 +1430,49 @@ select_insn: out: CONT; } - /* JMP */ JMP_JA: insn += insn->off; CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_X: - if (DST < SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_K: - if (DST < IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_X: - if (DST <= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_K: - if (DST <= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_X: - if (((s64) DST) < ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_K: - if (((s64) DST) < ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_X: - if (((s64) DST) <= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_K: - if (((s64) DST) <= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; JMP_EXIT: return BPF_R0; - + /* JMP */ +#define COND_JMP(SIGN, OPCODE, CMP_OP) \ + JMP_##OPCODE##_X: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_X: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP_##OPCODE##_K: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_K: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; + COND_JMP(u, JEQ, ==) + COND_JMP(u, JNE, !=) + COND_JMP(u, JGT, >) + COND_JMP(u, JLT, <) + COND_JMP(u, JGE, >=) + COND_JMP(u, JLE, <=) + COND_JMP(u, JSET, &) + COND_JMP(s, JSGT, >) + COND_JMP(s, JSLT, <) + COND_JMP(s, JSGE, >=) + COND_JMP(s, JSLE, <=) +#undef COND_JMP /* STX and ST and LDX*/ #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ -- cgit v1.2.3-71-gd317 From a7b76c8857692b0fce063b94ed83da11c396d341 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:05 -0500 Subject: bpf: JIT blinds support JMP32 This patch adds JIT blinds support for JMP32. Like BPF_JMP_REG/IMM, JMP32 version are needed for building raw bpf insn. They are added to both include/linux/filter.h and tools/include/linux/filter.h. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 20 ++++++++++++++++++++ kernel/bpf/core.c | 21 +++++++++++++++++++++ tools/include/linux/filter.h | 20 ++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index be9af6b4a9e4..e4b473f85b46 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -277,6 +277,26 @@ struct sock_reuseport; .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bba11c2565ee..a7bcb23bee84 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -949,6 +949,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, + off); + break; + case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index af55acf73e75..cce0b02c0e28 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -199,6 +199,16 @@ .off = OFF, \ .imm = 0 }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ @@ -209,6 +219,16 @@ .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ -- cgit v1.2.3-71-gd317