From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: -- cgit v1.2.3-71-gd317 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handler just pass through the data to one of the common handlers a lot of the changes are mechnical. Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 +594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. */ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. */ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3-71-gd317 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending how collection traversal is done, even if the object still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on no-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- include/uapi/linux/bpf.h | 39 +++++++- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 + tools/include/uapi/linux/bpf.h | 39 +++++++- 4 files changed, 292 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f43d193aff3a..ded304c96a05 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -414,6 +414,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', 'struct __sk_buff', 'struct sk_msg_md', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3-71-gd317 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enable ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that the networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigation applied. Such networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) (limited to 'kernel/trace') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3-71-gd317 From f470378c7562a2818b45ed11c98973f2b89eedd3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:55 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with probe_* and *current_task* Often it is useful when applying policy to know something about the task. If the administrator has CAP_SYS_ADMIN rights then they can use kprobe + networking hook and link the two programs together to accomplish this. However, this is a bit clunky and also means we have to call both the network program and kprobe program when we could just use a single program and avoid passing metadata through sk_msg/skb->cb, socket, maps, etc. To accomplish this add probe_* helpers to bpf_base_func_proto programs guarded by a perfmon_capable() check. New supported helpers are the following, BPF_FUNC_get_current_task BPF_FUNC_probe_read_user BPF_FUNC_probe_read_kernel BPF_FUNC_probe_read_user_str BPF_FUNC_probe_read_kernel_str Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033905529.12355.4368381069655254932.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 24 ++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 10 +++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 886949fdcece..bb4fb634275e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,12 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_proto __weak; +const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; +const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; + const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -647,6 +653,24 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + default: + break; + } + + if (!perfmon_capable()) + return NULL; + + switch (func_id) { + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9531f54d0a3a..187cd6995bbb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_proto = { +const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, @@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, return ret; } -static const struct bpf_func_proto bpf_probe_read_user_str_proto = { +const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, @@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); } -static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, @@ -907,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task) return (long) current; } -static const struct bpf_func_proto bpf_get_current_task_proto = { +const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, -- cgit v1.2.3-71-gd317 From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF could represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill. Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But then would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but so doesn't few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has an advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward, as would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with hashed task's tgid being a look up key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of ring buffer and has to be a power of 2 value. There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics: - variable-length records; - if there is no more space left in ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to a data inside ring buffer data area is returned, which BPF programs can use similarly to a data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes consumer ignore the record. bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because record has to be prepared in some other place first. But it allows to submit records of the length that's not known to verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly to ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have use extra per-CPU array as a temporary heap for preparing sample. bpf_ringbuf_reserve() avoid this needs completely. But in exchange, it only allows a known constant size of memory to be reserved, such that verifier can verify that BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within single BPF program invocation. Each reserved record is tracked by verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record, but forget to submit (or discard) it. bpf_ringbuf_query() helper allows to query various properties of ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer; - BPF_RB_RING_SIZE returns the size of ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical possition of consumer/producer, respectively. Returned values are momentarily snapshots of ring buffer state and could be off by the time helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics, that take into account highly-changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in ring buffer. Together with BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers, it allows BPF program a high degree of control and, e.g., more efficient batched notifications. Default self-balancing strategy, though, should be adequate for most applications and will work reliable and efficiently already. Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if BPF program was interruped by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock, in which case reservation will fail even if ring buffer is not full. The ring buffer itself internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - consumer counter shows up to which logical position consumer consumed the data; - producer counter denotes amount of data reserved by all producers. Each time a record is reserved, producer that "owns" the record will successfully advance producer counter. At that point, data is still not yet ready to be consumed, though. Each record has 8 byte header, which contains the length of reserved record, as well as two extra bits: busy bit to denote that record is still being worked on, and discard bit, which might be set at commit time if record is discarded. In the latter case, consumer is supposed to skip the record and move on to the next one. Record header also encodes record's relative offset from the beginning of ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without requiring also the pointer to ring buffer itself. Ring buffer memory location will be restored from record metadata header. This significantly simplifies verifier, as well as improving API usability. Producer counter increments are serialized under spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to consumer in the order of reservations, but only after all previous records where already committed. It is thus possible for slow producers to temporarily hold off submitted records, that were reserved later. Reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit, that significantly simplifies (and thus speeds up as well) implementation of both producers and consumers is how data area is mapped twice contiguously back-to-back in the virtual memory. This allows to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. See comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from perf ring buffer is a self-pacing notifications of new data being availability. bpf_ringbuf_commit() implementation will send a notification of new record being available after commit only if consumer has already caught up right up to the record being committed. If not, consumer still has to catch up and thus will see new data anyways without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows to achieve a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. For extreme cases, when BPF program wants more manual control of notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before considering implementing BPF ring buffer from scratch existing alternatives in kernel were evaluated, but didn't seem to meet the needs. They largely fell into few categores: - per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning SPSC queue into MPSC w/ lock would have subpar performance compared to locked reserve + lockless commit, as with BPF ring buffer. Fixed sized elements would be too limiting for BPF programs, given existing BPF programs heavily rely on variable-sized perf buffer already; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications, that didn't seem to fit well for intended use with BPF programs. [0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 + include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 4 + include/uapi/linux/bpf.h | 84 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++ kernel/bpf/syscall.c | 12 + kernel/bpf/verifier.c | 195 ++++++-- kernel/trace/bpf_trace.c | 10 + tools/include/uapi/linux/bpf.h | 84 +++- tools/testing/selftests/bpf/verifier/and.c | 4 +- .../testing/selftests/bpf/verifier/array_access.c | 4 +- tools/testing/selftests/bpf/verifier/bounds.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../selftests/bpf/verifier/direct_value_access.c | 4 +- .../selftests/bpf/verifier/helper_access_var_len.c | 2 +- .../selftests/bpf/verifier/helper_value_access.c | 6 +- .../selftests/bpf/verifier/value_ptr_arith.c | 8 +- 19 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efe8836b5c48..e5884f7f801c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,8 @@ struct bpf_map_ops { int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); + __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts); }; struct bpf_map_memory { @@ -244,6 +246,9 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ }; /* type of values returned from helper functions */ @@ -255,6 +260,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -322,6 +328,8 @@ enum bpf_reg_type { PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; +extern const struct bpf_func_proto bpf_ringbuf_output_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_proto; +extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 29d22752fc87..fa8e1b552acd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea833087e853..ca08db4ffb5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -54,6 +54,8 @@ struct bpf_reg_state { u32 btf_id; /* for PTR_TO_BTF_ID */ + u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -63,6 +65,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation + * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 58f4aa593b1b..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -354,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 7629a0cebb9b..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..5a605ae131a9 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -318,7 +318,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..961f28139b96 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", -- cgit v1.2.3-71-gd317 From b36e62eb85215a60916f910070f6d494b4f3e73a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 May 2020 17:48:10 -0700 Subject: bpf: Use strncpy_from_unsafe_strict() in bpf_seq_printf() helper In bpf_seq_printf() helper, when user specified a "%s" in the format string, strncpy_from_unsafe() is used to read the actual string to a buffer. The string could be a format string or a string in the kernel data structure. It is really unlikely that the string will reside in the user memory. This is different from Commit b2a5212fb634 ("bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier") which still used strncpy_from_unsafe() for "%s" to preserve the old behavior. If in the future, bpf_seq_printf() indeed needs to read user memory, we can implement "%pus" format string. Based on discussion in [1], if the intent is to read kernel memory, strncpy_from_unsafe_strict() should be used. So this patch changed to use strncpy_from_unsafe_strict(). [1]: https://lore.kernel.org/bpf/20200521152301.2587579-1-hch@lst.de/T/ Fixes: 492e639f0c22 ("bpf: Add bpf_seq_printf and bpf_seq_write helpers") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200529004810.3352219-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3767d34114c0..b6c24be5ff53 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -585,9 +585,9 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, goto out; } - err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; -- cgit v1.2.3-71-gd317 From 958a3f2d2aff896ae2a622878e456114f4a4cd15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 31 May 2020 17:42:55 +0200 Subject: bpf: Use tracing helpers for lsm programs Currenty lsm uses bpf_tracing_func_proto helpers which do not include stack trace or perf event output. It's useful to have those for bpftrace lsm support [1]. Using tracing_prog_func_proto helpers for lsm programs. [1] https://github.com/iovisor/bpftrace/pull/1347 Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200531154255.896551-1-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_lsm.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e042311f991f..07052d44bca1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1633,6 +1633,9 @@ extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto *tracing_prog_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 19636703b24e..fb278144e9fd 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = bpf_tracing_func_proto, + .get_func_proto = tracing_prog_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b6c24be5ff53..c41186417d93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1467,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * +const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { -- cgit v1.2.3-71-gd317