From 9a18eedb145d080d542766af1d7513ebfccd1604 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:04 -0800 Subject: bpf: offload: don't use prog->aux->offload as boolean We currently use aux->offload to indicate that program is bound to a specific device. This forces us to keep the offload structure around even after the device is gone. Add a bool member to struct bpf_prog_aux to indicate if offload was requested. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da54ef644fcd..838eee10e979 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -201,6 +201,7 @@ struct bpf_prog_aux { u32 stack_depth; u32 id; u32 func_cnt; + bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; @@ -529,7 +530,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { - return aux->offload; + return aux->offload_requested; } #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, -- cgit v1.2.3-71-gd317 From cae1927c0b4a93ae15de824faca1f6f611a44fcd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:05 -0800 Subject: bpf: offload: allow netdev to disappear while verifier is running To allow verifier instruction callbacks without any extra locking NETDEV_UNREGISTER notification would wait on a waitqueue for verifier to finish. This design decision was made when rtnl lock was providing all the locking. Use the read/write lock instead and remove the workqueue. Verifier will now call into the offload code, so dev_ops are moved to offload structure. Since verifier calls are all under bpf_prog_is_dev_bound() we no longer need static inline implementations to please builds with CONFIG_NET=n. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.h | 2 +- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 2 +- drivers/net/netdevsim/bpf.c | 2 +- include/linux/bpf.h | 9 +++++-- include/linux/bpf_verifier.h | 16 ++---------- include/linux/netdevice.h | 4 +-- kernel/bpf/offload.c | 30 ++++++++++++----------- kernel/bpf/verifier.c | 20 ++++++--------- 8 files changed, 37 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index aae1be9ed056..89a9b6393882 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -238,7 +238,7 @@ struct nfp_bpf_vnic { int nfp_bpf_jit(struct nfp_prog *prog); -extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops; +extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops; struct netdev_bpf; struct nfp_app; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 9c2608445bd8..d8870c2f11f3 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -260,6 +260,6 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) return 0; } -const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops = { +const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = { .insn_hook = nfp_verify_insn, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index a243fa7ae02f..5134d5c1306c 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -66,7 +66,7 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) return 0; } -static const struct bpf_ext_analyzer_ops nsim_bpf_analyzer_ops = { +static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = { .insn_hook = nsim_bpf_verify_insn, }; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 838eee10e979..669549f7e3e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -17,6 +17,7 @@ #include #include +struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; @@ -184,14 +185,18 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_prog_offload_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + struct bpf_dev_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; struct list_head offloads; bool dev_state; - bool verifier_running; - wait_queue_head_t verifier_done; + const struct bpf_prog_offload_ops *dev_ops; }; struct bpf_prog_aux { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 883a35d50cd5..2feb218c001d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -166,12 +166,6 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) return log->len_used >= log->len_total - 1; } -struct bpf_verifier_env; -struct bpf_ext_analyzer_ops { - int (*insn_hook)(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx); -}; - #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -185,7 +179,6 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -206,13 +199,8 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return cur->frame[cur->curframe]->regs; } -#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); -#else -static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) -{ - return -EOPNOTSUPP; -} -#endif +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 352066e4eeef..49bfc6eec74c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -804,7 +804,7 @@ enum bpf_netdev_command { BPF_OFFLOAD_DESTROY, }; -struct bpf_ext_analyzer_ops; +struct bpf_prog_offload_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -826,7 +826,7 @@ struct netdev_bpf { /* BPF_OFFLOAD_VERIFIER_PREP */ struct { struct bpf_prog *prog; - const struct bpf_ext_analyzer_ops *ops; /* callee set */ + const struct bpf_prog_offload_ops *ops; /* callee set */ } verifier; /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ struct { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 032079754d88..69ddc3899bab 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -44,7 +44,6 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); @@ -97,15 +96,28 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_dev_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload->netdev) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; @@ -117,9 +129,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); @@ -132,9 +141,6 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); @@ -146,15 +152,11 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; data.offload.prog = prog; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98d8637cf70d..a2b211262c25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4438,15 +4438,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - - return 0; -} - static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4531,9 +4522,12 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, insn_idx, + prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; @@ -5463,7 +5457,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; -- cgit v1.2.3-71-gd317 From ad8ad79f4f6078f456792f7f8d344da2be9bc74f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:07 -0800 Subject: bpf: offload: free program id when device disappears Bound programs are quite useless after their device disappears. They are simply waiting for reference count to go to zero, don't list them in BPF_PROG_GET_NEXT_ID by freeing their ID early. Note that orphaned offload programs will return -ENODEV on BPF_OBJ_GET_INFO_BY_FD so user will never see ID 0. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/offload.c | 3 +++ kernel/bpf/syscall.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 669549f7e3e8..9a916ab34299 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -357,6 +357,8 @@ void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); + struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3126e1a842e6..e4f1668a021c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,6 +130,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0afc2e39fd5..e02dafa6f402 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -905,9 +905,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. + */ if (!prog->aux->id) return; @@ -917,6 +921,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); -- cgit v1.2.3-71-gd317 From cdab6ba8668d68f031bfd5d237b4586ec4f8cd88 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:08 -0800 Subject: nsfs: generalize ns_get_path() for path resolution with a task ns_get_path() takes struct task_struct and proc_ns_ops as its parameters. For path resolution directly from a namespace, e.g. based on a networking device's net name space, we need more flexibility. Add a ns_get_path_cb() helper which will allow callers to use any method of obtaining the name space reference. Convert ns_get_path() to use ns_get_path_cb(). Following patches will bring a networking user. CC: Eric W. Biederman Suggested-by: Daniel Borkmann Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- fs/nsfs.c | 29 ++++++++++++++++++++++++++--- include/linux/proc_ns.h | 3 +++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 7c6f76d29f56..36b0772701a0 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -103,14 +103,14 @@ slow: goto got_it; } -void *ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops) +void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, + void *private_data) { struct ns_common *ns; void *ret; again: - ns = ns_ops->get(task); + ns = ns_get_cb(private_data); if (!ns) return ERR_PTR(-ENOENT); @@ -120,6 +120,29 @@ again: return ret; } +struct ns_get_path_task_args { + const struct proc_ns_operations *ns_ops; + struct task_struct *task; +}; + +static struct ns_common *ns_get_path_task(void *private_data) +{ + struct ns_get_path_task_args *args = private_data; + + return args->ns_ops->get(args->task); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_get_path_task_args args = { + .ns_ops = ns_ops, + .task = task, + }; + + return ns_get_path_cb(path, ns_get_path_task, &args); +} + int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 2ff18c9840a7..d31cb6215905 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -78,6 +78,9 @@ extern struct file *proc_ns_fget(int fd); #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +extern void *ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); extern int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); -- cgit v1.2.3-71-gd317 From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:09 -0800 Subject: bpf: offload: report device information for offloaded programs Report to the user ifindex and namespace information of offloaded programs. If device has disappeared return -ENODEV. Specify the namespace using dev/inode combination. CC: Eric W. Biederman Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 59 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ tools/include/uapi/linux/bpf.h | 3 +++ 5 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a916ab34299..7810ae57b357 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69eabfcb9bdb..f2f8b36e2ad4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e4f1668a021c..040d4e0edf3f 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e02dafa6f402..ebf0fb23e237 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index db1b0923a308..4e8c60acfa32 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3-71-gd317 From 5f103c5d4dbadec0f2cacd39b6429e1b8a8cf983 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:56 -0800 Subject: bpf: only build sockmap with CONFIG_INET The sockmap infrastructure is only aware of TCP sockets at the moment. In the future we plan to add UDP. In both cases CONFIG_NET should be built-in. So lets only build sockmap if CONFIG_INET is enabled. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- include/linux/bpf_types.h | 2 +- kernel/bpf/Makefile | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7810ae57b357..9e03046d1df2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -554,7 +554,7 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 978c1d9c9383..19b8349a3809 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -42,7 +42,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -#ifdef CONFIG_STREAM_PARSER +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e691da0b3bab..a713fd23ec88 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif -- cgit v1.2.3-71-gd317 From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:13 +0100 Subject: xdp: base API for new XDP rx-queue info concept This patch only introduce the core data structures and API functions. All XDP enabled drivers must use the API before this info can used. There is a need for XDP to know more about the RX-queue a given XDP frames have arrived on. For both the XDP bpf-prog and kernel side. Instead of extending xdp_buff each time new info is needed, the patch creates a separate read-mostly struct xdp_rxq_info, that contains this info. We stress this data/cache-line is for read-only info. This is NOT for dynamic per packet info, use the data_meta for such use-cases. The performance advantage is this info can be setup at RX-ring init time, instead of updating N-members in xdp_buff. A possible (driver level) micro optimization is that xdp_buff->rxq assignment could be done once per XDP/NAPI loop. The extra pointer deref only happens for program needing access to this info (thus, no slowdown to existing use-cases). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 ++ include/net/xdp.h | 47 +++++++++++++++++++++++++++++++++++ net/core/Makefile | 2 +- net/core/xdp.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 include/net/xdp.h create mode 100644 net/core/xdp.c (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b0df2703671..425056c7f96c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -503,6 +504,7 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; + struct xdp_rxq_info *rxq; }; /* Compute the linear packet data range [data, data_end) which diff --git a/include/net/xdp.h b/include/net/xdp.h new file mode 100644 index 000000000000..86c41631a908 --- /dev/null +++ b/include/net/xdp.h @@ -0,0 +1,47 @@ +/* include/net/xdp.h + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#ifndef __LINUX_NET_XDP_H__ +#define __LINUX_NET_XDP_H__ + +/** + * DOC: XDP RX-queue information + * + * The XDP RX-queue info (xdp_rxq_info) is associated with the driver + * level RX-ring queues. It is information that is specific to how + * the driver have configured a given RX-ring queue. + * + * Each xdp_buff frame received in the driver carry a (pointer) + * reference to this xdp_rxq_info structure. This provides the XDP + * data-path read-access to RX-info for both kernel and bpf-side + * (limited subset). + * + * For now, direct access is only safe while running in NAPI/softirq + * context. Contents is read-mostly and must not be updated during + * driver NAPI/softirq poll. + * + * The driver usage API is a register and unregister API. + * + * The struct is not directly tied to the XDP prog. A new XDP prog + * can be attached as long as it doesn't change the underlying + * RX-ring. If the RX-ring does change significantly, the NIC driver + * naturally need to stop the RX-ring before purging and reallocating + * memory. In that process the driver MUST call unregistor (which + * also apply for driver shutdown and unload). The register API is + * also mandatory during RX-ring setup. + */ + +struct xdp_rxq_info { + struct net_device *dev; + u32 queue_index; + u32 reg_state; +} ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index); +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); + +#endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/Makefile b/net/core/Makefile index 1fd0a9c88b1b..6dbbba8c57ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o + fib_notifier.o xdp.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000000..229bc5a0ee04 --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,67 @@ +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#include +#include + +#include + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index) +{ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); -- cgit v1.2.3-71-gd317 From e817f85652c14d78f170b18797e4c477c78949e0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:09 +0100 Subject: xdp: generic XDP handling of xdp_rxq_info Hook points for xdp_rxq_info: * reg : netif_alloc_rx_queues * unreg: netif_free_rx_queues The net_device have some members (num_rx_queues + real_num_rx_queues) and data-area (dev->_rx with struct netdev_rx_queue's) that were primarily used for exporting information about RPS (CONFIG_RPS) queues to sysfs (CONFIG_SYSFS). For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info, and remove some of the CONFIG_SYSFS ifdefs. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 69 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49bfc6eec74c..440b000f07f4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -44,6 +44,7 @@ #include #endif #include +#include #include #include @@ -686,6 +687,7 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + struct xdp_rxq_info xdp_rxq; } ____cacheline_aligned_in_smp; /* diff --git a/net/core/dev.c b/net/core/dev.c index 2eb66c0d9cdb..d7925ef8743d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3906,9 +3906,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + rx = dev->_rx; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev) might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); -- cgit v1.2.3-71-gd317