From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As it has been already done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- include/uapi/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, -- cgit v1.2.3-71-gd317 From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- include/linux/bpf.h | 6 ++ include/net/xdp.h | 5 ++ include/trace/events/xdp.h | 14 +++-- include/uapi/linux/bpf.h | 5 ++ kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++----- net/core/dev.c | 9 +++ tools/include/uapi/linux/bpf.h | 5 ++ 7 files changed, 148 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c67c88ad35f8..54ad426dbea1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool cpu_map_prog_allowed(struct bpf_map *map); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return false; +} + static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 5b383c450858..83b9e0142b52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -98,6 +98,11 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +struct xdp_cpumap_stats { + unsigned int pass; + unsigned int drop; +}; + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b73d3e141323..e2c99f5bee39 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, - int sched), + int sched, struct xdp_cpumap_stats *xdp_stats), - TP_ARGS(map_id, processed, drops, sched), + TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) @@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) + __field(unsigned int, xdp_pass) + __field(unsigned int, xdp_drop) ), TP_fast_assign( @@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; + __entry->xdp_pass = xdp_stats->pass; + __entry->xdp_drop = xdp_stats->drop; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" - " sched=%d", + " sched=%d" + " xdp_pass=%u xdp_drop=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, - __entry->sched) + __entry->sched, + __entry->xdp_pass, __entry->xdp_drop) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { -- cgit v1.2.3-71-gd317 From bfdfa51702dec67e9fcd52568b4cf3c7f799db8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:29:11 -0700 Subject: bpf: Drop duplicated words in uapi helper comments Drop doubled words "will" and "attach". Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6b9f71ae-4f8e-0259-2c5d-187ddaefe6eb@infradead.org --- include/uapi/linux/bpf.h | 6 +++--- tools/include/uapi/linux/bpf.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ -- cgit v1.2.3-71-gd317 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 3 + include/linux/bpf.h | 1 + include/linux/bpf_types.h | 2 + include/linux/filter.h | 17 +++++ include/uapi/linux/bpf.h | 77 +++++++++++++++++++ kernel/bpf/net_namespace.c | 5 ++ kernel/bpf/syscall.c | 9 +++ kernel/bpf/verifier.c | 13 +++- net/core/filter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 9 ++- 10 files changed, 312 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 47d5b0c708c9..722f799c1a2e 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -8,6 +8,7 @@ enum netns_bpf_attach_type { NETNS_BPF_INVALID = -1, NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, MAX_NETNS_BPF_ATTACH_TYPE }; @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type) switch (attach_type) { case BPF_FLOW_DISSECTOR: return NETNS_BPF_FLOW_DISSECTOR; + case BPF_SK_LOOKUP: + return NETNS_BPF_SK_LOOKUP; default: return NETNS_BPF_INVALID; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8c9eabcd106..adb16bdc5f0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -249,6 +249,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a18ae82a298a..a52a5688418e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup, + struct bpf_sk_lookup, struct bpf_sk_lookup_kern) #endif #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, diff --git a/include/linux/filter.h b/include/linux/filter.h index 0b0144752d78..fa1ea12ad2cd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern { s32 retval; }; +struct bpf_sk_lookup_kern { + u16 family; + u16 protocol; + struct { + __be32 saddr; + __be32 daddr; + } v4; + struct { + const struct in6_addr *saddr; + const struct in6_addr *daddr; + } v6; + __be16 sport; + u16 dport; + struct sock *selected_sk; + bool no_reuseport; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ac3992dacfe..54d0c886e3ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -228,6 +229,7 @@ enum bpf_attach_type { BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, __MAX_BPF_ATTACH_TYPE }; @@ -3069,6 +3071,10 @@ union bpf_attr { * * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3094,6 +3100,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3607,6 +3663,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -4349,4 +4411,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6843376733df..5bfa448b4704 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -404,6 +404,7 @@ class PrinterHelpers(Printer): type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', @@ -487,6 +489,11 @@ class PrinterHelpers(Printer): 'struct sk_msg_buff': 'struct sk_msg_md', 'struct xdp_buff': 'struct xdp_md', } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] def print_header(self): header = '''\ @@ -543,7 +550,7 @@ class PrinterHelpers(Printer): for i, a in enumerate(proto['args']): t = a['type'] n = a['name'] - if proto['name'] == 'bpf_get_socket_cookie' and i == 0: + if proto['name'] in self.overloaded_helpers and i == 0: t = 'void' n = 'ctx' one_arg = '{}{}'.format(comma, self.map_type(t)) -- cgit v1.2.3-71-gd317