cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

syscall.c (128743B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
      3 */
      4#include <linux/bpf.h>
      5#include <linux/bpf-cgroup.h>
      6#include <linux/bpf_trace.h>
      7#include <linux/bpf_lirc.h>
      8#include <linux/bpf_verifier.h>
      9#include <linux/bsearch.h>
     10#include <linux/btf.h>
     11#include <linux/syscalls.h>
     12#include <linux/slab.h>
     13#include <linux/sched/signal.h>
     14#include <linux/vmalloc.h>
     15#include <linux/mmzone.h>
     16#include <linux/anon_inodes.h>
     17#include <linux/fdtable.h>
     18#include <linux/file.h>
     19#include <linux/fs.h>
     20#include <linux/license.h>
     21#include <linux/filter.h>
     22#include <linux/kernel.h>
     23#include <linux/idr.h>
     24#include <linux/cred.h>
     25#include <linux/timekeeping.h>
     26#include <linux/ctype.h>
     27#include <linux/nospec.h>
     28#include <linux/audit.h>
     29#include <uapi/linux/btf.h>
     30#include <linux/pgtable.h>
     31#include <linux/bpf_lsm.h>
     32#include <linux/poll.h>
     33#include <linux/sort.h>
     34#include <linux/bpf-netns.h>
     35#include <linux/rcupdate_trace.h>
     36#include <linux/memcontrol.h>
     37#include <linux/trace_events.h>
     38
     39#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
     40			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
     41			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
     42#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
     43#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
     44#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
     45			IS_FD_HASH(map))
     46
     47#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
     48
     49DEFINE_PER_CPU(int, bpf_prog_active);
     50static DEFINE_IDR(prog_idr);
     51static DEFINE_SPINLOCK(prog_idr_lock);
     52static DEFINE_IDR(map_idr);
     53static DEFINE_SPINLOCK(map_idr_lock);
     54static DEFINE_IDR(link_idr);
     55static DEFINE_SPINLOCK(link_idr_lock);
     56
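        /* Unprivileged bpf() policy: 0 allows unprivileged calls, 1 disables
         * them permanently (cannot be unset again), and 2, the default under
         * CONFIG_BPF_UNPRIV_DEFAULT_OFF, disables them but can still be
         * switched back to 0 by an admin (summary; the corresponding sysctl
         * handler enforces the exact transition rules).
         */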
     57int sysctl_unprivileged_bpf_disabled __read_mostly =
     58	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
     59
     60static const struct bpf_map_ops * const bpf_map_types[] = {
     61#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
     62#define BPF_MAP_TYPE(_id, _ops) \
     63	[_id] = &_ops,
     64#define BPF_LINK_TYPE(_id, _name)
     65#include <linux/bpf_types.h>
     66#undef BPF_PROG_TYPE
     67#undef BPF_MAP_TYPE
     68#undef BPF_LINK_TYPE
     69};
     70
     71/*
     72 * If we're handed a bigger struct than we know of, ensure all the unknown bits
     73 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
     74 * we don't know about yet.
     75 *
     76 * There is a ToCToU between this function call and the following
      77 * copy_from_user() call. However, this is not a concern since this function
      78 * is only meant as future-proofing against unknown trailing bits.
     79 */
     80int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
     81			     size_t expected_size,
     82			     size_t actual_size)
     83{
     84	int res;
     85
     86	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
     87		return -E2BIG;
     88
     89	if (actual_size <= expected_size)
     90		return 0;
     91
     92	if (uaddr.is_kernel)
     93		res = memchr_inv(uaddr.kernel + expected_size, 0,
     94				 actual_size - expected_size) == NULL;
     95	else
     96		res = check_zeroed_user(uaddr.user + expected_size,
     97					actual_size - expected_size);
     98	if (res < 0)
     99		return res;
    100	return res ? 0 : -E2BIG;
    101}
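        /* Example: a newer userspace may pass a 'union bpf_attr' that is larger
         * than this kernel knows about. With expected_size = 120 and
         * actual_size = 128, the call succeeds only if the trailing 8 unknown
         * bytes are all zero; any non-zero byte in the tail yields -E2BIG.
         * An actual_size less than or equal to expected_size is always accepted.
         */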
    102
    103const struct bpf_map_ops bpf_map_offload_ops = {
    104	.map_meta_equal = bpf_map_meta_equal,
    105	.map_alloc = bpf_map_offload_map_alloc,
    106	.map_free = bpf_map_offload_map_free,
    107	.map_check_btf = map_check_no_btf,
    108};
    109
    110static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
    111{
    112	const struct bpf_map_ops *ops;
    113	u32 type = attr->map_type;
    114	struct bpf_map *map;
    115	int err;
    116
    117	if (type >= ARRAY_SIZE(bpf_map_types))
    118		return ERR_PTR(-EINVAL);
    119	type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
    120	ops = bpf_map_types[type];
    121	if (!ops)
    122		return ERR_PTR(-EINVAL);
    123
    124	if (ops->map_alloc_check) {
    125		err = ops->map_alloc_check(attr);
    126		if (err)
    127			return ERR_PTR(err);
    128	}
    129	if (attr->map_ifindex)
    130		ops = &bpf_map_offload_ops;
    131	map = ops->map_alloc(attr);
    132	if (IS_ERR(map))
    133		return map;
    134	map->ops = ops;
    135	map->map_type = type;
    136	return map;
    137}
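        /* Note: the array_index_nospec() clamp above keeps the map_type index
         * from being used speculatively past the end of bpf_map_types[],
         * mirroring the bounds check right before it (Spectre v1 hardening).
         */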
    138
    139static void bpf_map_write_active_inc(struct bpf_map *map)
    140{
    141	atomic64_inc(&map->writecnt);
    142}
    143
    144static void bpf_map_write_active_dec(struct bpf_map *map)
    145{
    146	atomic64_dec(&map->writecnt);
    147}
    148
    149bool bpf_map_write_active(const struct bpf_map *map)
    150{
    151	return atomic64_read(&map->writecnt) != 0;
    152}
    153
    154static u32 bpf_map_value_size(const struct bpf_map *map)
    155{
    156	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
    157	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
    158	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
    159	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
    160		return round_up(map->value_size, 8) * num_possible_cpus();
    161	else if (IS_FD_MAP(map))
    162		return sizeof(u32);
    163	else
    164		return  map->value_size;
    165}
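        /* Example: a BPF_MAP_TYPE_PERCPU_ARRAY with value_size = 12 on a system
         * with 4 possible CPUs exposes round_up(12, 8) * 4 = 64 bytes per
         * element to the syscall interface; fd-based maps (IS_FD_MAP) always
         * expose a 4-byte value, and all other maps expose value_size bytes.
         */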
    166
    167static void maybe_wait_bpf_programs(struct bpf_map *map)
    168{
    169	/* Wait for any running BPF programs to complete so that
    170	 * userspace, when we return to it, knows that all programs
    171	 * that could be running use the new map value.
    172	 */
    173	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
    174	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
    175		synchronize_rcu();
    176}
    177
    178static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
    179				void *value, __u64 flags)
    180{
    181	int err;
    182
    183	/* Need to create a kthread, thus must support schedule */
    184	if (bpf_map_is_dev_bound(map)) {
    185		return bpf_map_offload_update_elem(map, key, value, flags);
    186	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
    187		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
    188		return map->ops->map_update_elem(map, key, value, flags);
    189	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
    190		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
    191		return sock_map_update_elem_sys(map, key, value, flags);
    192	} else if (IS_FD_PROG_ARRAY(map)) {
    193		return bpf_fd_array_map_update_elem(map, f.file, key, value,
    194						    flags);
    195	}
    196
    197	bpf_disable_instrumentation();
    198	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
    199	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
    200		err = bpf_percpu_hash_update(map, key, value, flags);
    201	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
    202		err = bpf_percpu_array_update(map, key, value, flags);
    203	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
    204		err = bpf_percpu_cgroup_storage_update(map, key, value,
    205						       flags);
    206	} else if (IS_FD_ARRAY(map)) {
    207		rcu_read_lock();
    208		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
    209						   flags);
    210		rcu_read_unlock();
    211	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
    212		rcu_read_lock();
    213		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
    214						  flags);
    215		rcu_read_unlock();
    216	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
    217		/* rcu_read_lock() is not needed */
    218		err = bpf_fd_reuseport_array_update_elem(map, key, value,
    219							 flags);
    220	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
    221		   map->map_type == BPF_MAP_TYPE_STACK ||
    222		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
    223		err = map->ops->map_push_elem(map, value, flags);
    224	} else {
    225		rcu_read_lock();
    226		err = map->ops->map_update_elem(map, key, value, flags);
    227		rcu_read_unlock();
    228	}
    229	bpf_enable_instrumentation();
    230	maybe_wait_bpf_programs(map);
    231
    232	return err;
    233}
    234
    235static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
    236			      __u64 flags)
    237{
    238	void *ptr;
    239	int err;
    240
    241	if (bpf_map_is_dev_bound(map))
    242		return bpf_map_offload_lookup_elem(map, key, value);
    243
    244	bpf_disable_instrumentation();
    245	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
    246	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
    247		err = bpf_percpu_hash_copy(map, key, value);
    248	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
    249		err = bpf_percpu_array_copy(map, key, value);
    250	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
    251		err = bpf_percpu_cgroup_storage_copy(map, key, value);
    252	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
    253		err = bpf_stackmap_copy(map, key, value);
    254	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
    255		err = bpf_fd_array_map_lookup_elem(map, key, value);
    256	} else if (IS_FD_HASH(map)) {
    257		err = bpf_fd_htab_map_lookup_elem(map, key, value);
    258	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
    259		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
    260	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
    261		   map->map_type == BPF_MAP_TYPE_STACK ||
    262		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
    263		err = map->ops->map_peek_elem(map, value);
    264	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
    265		/* struct_ops map requires directly updating "value" */
    266		err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
    267	} else {
    268		rcu_read_lock();
    269		if (map->ops->map_lookup_elem_sys_only)
    270			ptr = map->ops->map_lookup_elem_sys_only(map, key);
    271		else
    272			ptr = map->ops->map_lookup_elem(map, key);
    273		if (IS_ERR(ptr)) {
    274			err = PTR_ERR(ptr);
    275		} else if (!ptr) {
    276			err = -ENOENT;
    277		} else {
    278			err = 0;
    279			if (flags & BPF_F_LOCK)
    280				/* lock 'ptr' and copy everything but lock */
    281				copy_map_value_locked(map, value, ptr, true);
    282			else
    283				copy_map_value(map, value, ptr);
    284			/* mask lock and timer, since value wasn't zero inited */
    285			check_and_init_map_value(map, value);
    286		}
    287		rcu_read_unlock();
    288	}
    289
    290	bpf_enable_instrumentation();
    291	maybe_wait_bpf_programs(map);
    292
    293	return err;
    294}
    295
     296/* Please do not use this function outside of the map creation path
     297 * (e.g. in the map update path) without taking care of setting the active
     298 * memory cgroup (see bpf_map_kmalloc_node() for an example).
    299 */
    300static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
    301{
     302	/* We really just want to fail instead of triggering the OOM killer
     303	 * under memory pressure, therefore we pass __GFP_NORETRY to kmalloc,
     304	 * which is used for lower-order allocation requests.
     305	 *
     306	 * It has been observed that higher-order allocation requests done by
     307	 * vmalloc with __GFP_NORETRY set might fail due to not trying to
     308	 * reclaim memory from the page cache, thus we set
     309	 * __GFP_RETRY_MAYFAIL to avoid such situations.
    310	 */
    311
    312	const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
    313	unsigned int flags = 0;
    314	unsigned long align = 1;
    315	void *area;
    316
    317	if (size >= SIZE_MAX)
    318		return NULL;
    319
    320	/* kmalloc()'ed memory can't be mmap()'ed */
    321	if (mmapable) {
    322		BUG_ON(!PAGE_ALIGNED(size));
    323		align = SHMLBA;
    324		flags = VM_USERMAP;
    325	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
    326		area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
    327				    numa_node);
    328		if (area != NULL)
    329			return area;
    330	}
    331
    332	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
    333			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
    334			flags, numa_node, __builtin_return_address(0));
    335}
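        /* Example: with 4 KiB pages PAGE_ALLOC_COSTLY_ORDER is 3, so
         * non-mmapable requests up to 32 KiB first try kmalloc_node() with
         * __GFP_NORETRY and only then fall back to vmalloc; larger or mmapable
         * requests go straight to __vmalloc_node_range(), the mmapable ones
         * with SHMLBA alignment and VM_USERMAP so they can later be mapped
         * into userspace.
         */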
    336
    337void *bpf_map_area_alloc(u64 size, int numa_node)
    338{
    339	return __bpf_map_area_alloc(size, numa_node, false);
    340}
    341
    342void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
    343{
    344	return __bpf_map_area_alloc(size, numa_node, true);
    345}
    346
    347void bpf_map_area_free(void *area)
    348{
    349	kvfree(area);
    350}
    351
    352static u32 bpf_map_flags_retain_permanent(u32 flags)
    353{
    354	/* Some map creation flags are not tied to the map object but
     355	 * rather to the map fd, so they have no meaning upon
     356	 * map object inspection, since multiple file descriptors with
     357	 * different (access) properties can exist for the same map. Thus,
     358	 * given these flags have zero meaning for the map itself, let's
     359	 * clear them here.
    360	 */
    361	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
    362}
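        /* Example: BPF_F_RDONLY / BPF_F_WRONLY only describe how one particular
         * fd may access the map and are dropped here, whereas flags such as
         * BPF_F_RDONLY_PROG or BPF_F_NO_PREALLOC describe the map object itself
         * and are kept in map->map_flags.
         */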
    363
    364void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
    365{
    366	map->map_type = attr->map_type;
    367	map->key_size = attr->key_size;
    368	map->value_size = attr->value_size;
    369	map->max_entries = attr->max_entries;
    370	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
    371	map->numa_node = bpf_map_attr_numa_node(attr);
    372	map->map_extra = attr->map_extra;
    373}
    374
    375static int bpf_map_alloc_id(struct bpf_map *map)
    376{
    377	int id;
    378
    379	idr_preload(GFP_KERNEL);
    380	spin_lock_bh(&map_idr_lock);
    381	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
    382	if (id > 0)
    383		map->id = id;
    384	spin_unlock_bh(&map_idr_lock);
    385	idr_preload_end();
    386
    387	if (WARN_ON_ONCE(!id))
    388		return -ENOSPC;
    389
    390	return id > 0 ? 0 : id;
    391}
    392
    393void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
    394{
    395	unsigned long flags;
    396
    397	/* Offloaded maps are removed from the IDR store when their device
    398	 * disappears - even if someone holds an fd to them they are unusable,
    399	 * the memory is gone, all ops will fail; they are simply waiting for
    400	 * refcnt to drop to be freed.
    401	 */
    402	if (!map->id)
    403		return;
    404
    405	if (do_idr_lock)
    406		spin_lock_irqsave(&map_idr_lock, flags);
    407	else
    408		__acquire(&map_idr_lock);
    409
    410	idr_remove(&map_idr, map->id);
    411	map->id = 0;
    412
    413	if (do_idr_lock)
    414		spin_unlock_irqrestore(&map_idr_lock, flags);
    415	else
    416		__release(&map_idr_lock);
    417}
    418
    419#ifdef CONFIG_MEMCG_KMEM
    420static void bpf_map_save_memcg(struct bpf_map *map)
    421{
    422	map->memcg = get_mem_cgroup_from_mm(current->mm);
    423}
    424
    425static void bpf_map_release_memcg(struct bpf_map *map)
    426{
    427	mem_cgroup_put(map->memcg);
    428}
    429
    430void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
    431			   int node)
    432{
    433	struct mem_cgroup *old_memcg;
    434	void *ptr;
    435
    436	old_memcg = set_active_memcg(map->memcg);
    437	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
    438	set_active_memcg(old_memcg);
    439
    440	return ptr;
    441}
    442
    443void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
    444{
    445	struct mem_cgroup *old_memcg;
    446	void *ptr;
    447
    448	old_memcg = set_active_memcg(map->memcg);
    449	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
    450	set_active_memcg(old_memcg);
    451
    452	return ptr;
    453}
    454
    455void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
    456				    size_t align, gfp_t flags)
    457{
    458	struct mem_cgroup *old_memcg;
    459	void __percpu *ptr;
    460
    461	old_memcg = set_active_memcg(map->memcg);
    462	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
    463	set_active_memcg(old_memcg);
    464
    465	return ptr;
    466}
    467
    468#else
    469static void bpf_map_save_memcg(struct bpf_map *map)
    470{
    471}
    472
    473static void bpf_map_release_memcg(struct bpf_map *map)
    474{
    475}
    476#endif
    477
    478static int bpf_map_kptr_off_cmp(const void *a, const void *b)
    479{
    480	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
    481
    482	if (off_desc1->offset < off_desc2->offset)
    483		return -1;
    484	else if (off_desc1->offset > off_desc2->offset)
    485		return 1;
    486	return 0;
    487}
    488
    489struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
    490{
    491	/* Since members are iterated in btf_find_field in increasing order,
    492	 * offsets appended to kptr_off_tab are in increasing order, so we can
     493	 * do a bsearch to find an exact match.
    494	 */
    495	struct bpf_map_value_off *tab;
    496
    497	if (!map_value_has_kptrs(map))
    498		return NULL;
    499	tab = map->kptr_off_tab;
    500	return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
    501}
    502
    503void bpf_map_free_kptr_off_tab(struct bpf_map *map)
    504{
    505	struct bpf_map_value_off *tab = map->kptr_off_tab;
    506	int i;
    507
    508	if (!map_value_has_kptrs(map))
    509		return;
    510	for (i = 0; i < tab->nr_off; i++) {
    511		if (tab->off[i].kptr.module)
    512			module_put(tab->off[i].kptr.module);
    513		btf_put(tab->off[i].kptr.btf);
    514	}
    515	kfree(tab);
    516	map->kptr_off_tab = NULL;
    517}
    518
    519struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
    520{
    521	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
    522	int size, i;
    523
    524	if (!map_value_has_kptrs(map))
    525		return ERR_PTR(-ENOENT);
    526	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
    527	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
    528	if (!new_tab)
    529		return ERR_PTR(-ENOMEM);
    530	/* Do a deep copy of the kptr_off_tab */
    531	for (i = 0; i < tab->nr_off; i++) {
    532		btf_get(tab->off[i].kptr.btf);
    533		if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
    534			while (i--) {
    535				if (tab->off[i].kptr.module)
    536					module_put(tab->off[i].kptr.module);
    537				btf_put(tab->off[i].kptr.btf);
    538			}
    539			kfree(new_tab);
    540			return ERR_PTR(-ENXIO);
    541		}
    542	}
    543	return new_tab;
    544}
    545
    546bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
    547{
    548	struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
    549	bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
    550	int size;
    551
    552	if (!a_has_kptr && !b_has_kptr)
    553		return true;
    554	if (a_has_kptr != b_has_kptr)
    555		return false;
    556	if (tab_a->nr_off != tab_b->nr_off)
    557		return false;
    558	size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
    559	return !memcmp(tab_a, tab_b, size);
    560}
    561
    562/* Caller must ensure map_value_has_kptrs is true. Note that this function can
    563 * be called on a map value while the map_value is visible to BPF programs, as
    564 * it ensures the correct synchronization, and we already enforce the same using
    565 * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
    566 */
    567void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
    568{
    569	struct bpf_map_value_off *tab = map->kptr_off_tab;
    570	unsigned long *btf_id_ptr;
    571	int i;
    572
    573	for (i = 0; i < tab->nr_off; i++) {
    574		struct bpf_map_value_off_desc *off_desc = &tab->off[i];
    575		unsigned long old_ptr;
    576
    577		btf_id_ptr = map_value + off_desc->offset;
    578		if (off_desc->type == BPF_KPTR_UNREF) {
    579			u64 *p = (u64 *)btf_id_ptr;
    580
     581			WRITE_ONCE(*p, 0);
    582			continue;
    583		}
    584		old_ptr = xchg(btf_id_ptr, 0);
    585		off_desc->kptr.dtor((void *)old_ptr);
    586	}
    587}
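        /* Note: unreferenced kptrs (BPF_KPTR_UNREF) are simply zeroed above,
         * while referenced kptrs are atomically exchanged with 0 and then
         * released through their registered destructor, so a concurrent
         * bpf_kptr_xchg() either sees the old pointer or NULL, never a
         * half-freed one.
         */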
    588
    589/* called from workqueue */
    590static void bpf_map_free_deferred(struct work_struct *work)
    591{
    592	struct bpf_map *map = container_of(work, struct bpf_map, work);
    593
    594	security_bpf_map_free(map);
    595	kfree(map->off_arr);
    596	bpf_map_release_memcg(map);
    597	/* implementation dependent freeing, map_free callback also does
    598	 * bpf_map_free_kptr_off_tab, if needed.
    599	 */
    600	map->ops->map_free(map);
    601}
    602
    603static void bpf_map_put_uref(struct bpf_map *map)
    604{
    605	if (atomic64_dec_and_test(&map->usercnt)) {
    606		if (map->ops->map_release_uref)
    607			map->ops->map_release_uref(map);
    608	}
    609}
    610
    611/* decrement map refcnt and schedule it for freeing via workqueue
     612 * (underlying map implementation ops->map_free() might sleep)
    613 */
    614static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
    615{
    616	if (atomic64_dec_and_test(&map->refcnt)) {
    617		/* bpf_map_free_id() must be called first */
    618		bpf_map_free_id(map, do_idr_lock);
    619		btf_put(map->btf);
    620		INIT_WORK(&map->work, bpf_map_free_deferred);
    621		schedule_work(&map->work);
    622	}
    623}
    624
    625void bpf_map_put(struct bpf_map *map)
    626{
    627	__bpf_map_put(map, true);
    628}
    629EXPORT_SYMBOL_GPL(bpf_map_put);
    630
    631void bpf_map_put_with_uref(struct bpf_map *map)
    632{
    633	bpf_map_put_uref(map);
    634	bpf_map_put(map);
    635}
    636
    637static int bpf_map_release(struct inode *inode, struct file *filp)
    638{
    639	struct bpf_map *map = filp->private_data;
    640
    641	if (map->ops->map_release)
    642		map->ops->map_release(map, filp);
    643
    644	bpf_map_put_with_uref(map);
    645	return 0;
    646}
    647
    648static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
    649{
    650	fmode_t mode = f.file->f_mode;
    651
    652	/* Our file permissions may have been overridden by global
     653	 * map permissions on the syscall side.
    654	 */
    655	if (READ_ONCE(map->frozen))
    656		mode &= ~FMODE_CAN_WRITE;
    657	return mode;
    658}
    659
    660#ifdef CONFIG_PROC_FS
    661/* Provides an approximation of the map's memory footprint.
     662 * Used only to provide backward compatibility and to display
     663 * reasonable "memlock" info.
    664 */
    665static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
    666{
    667	unsigned long size;
    668
    669	size = round_up(map->key_size + bpf_map_value_size(map), 8);
    670
    671	return round_up(map->max_entries * size, PAGE_SIZE);
    672}
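        /* Example: a plain array map with key_size = 4, value_size = 8 and
         * max_entries = 1024 reports round_up(4 + 8, 8) * 1024 = 16384 bytes,
         * which is already page aligned; the figure is an estimate and ignores
         * per-map-type overhead on purpose.
         */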
    673
    674static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
    675{
    676	struct bpf_map *map = filp->private_data;
    677	u32 type = 0, jited = 0;
    678
    679	if (map_type_contains_progs(map)) {
    680		spin_lock(&map->owner.lock);
    681		type  = map->owner.type;
    682		jited = map->owner.jited;
    683		spin_unlock(&map->owner.lock);
    684	}
    685
    686	seq_printf(m,
    687		   "map_type:\t%u\n"
    688		   "key_size:\t%u\n"
    689		   "value_size:\t%u\n"
    690		   "max_entries:\t%u\n"
    691		   "map_flags:\t%#x\n"
    692		   "map_extra:\t%#llx\n"
    693		   "memlock:\t%lu\n"
    694		   "map_id:\t%u\n"
    695		   "frozen:\t%u\n",
    696		   map->map_type,
    697		   map->key_size,
    698		   map->value_size,
    699		   map->max_entries,
    700		   map->map_flags,
    701		   (unsigned long long)map->map_extra,
    702		   bpf_map_memory_footprint(map),
    703		   map->id,
    704		   READ_ONCE(map->frozen));
    705	if (type) {
    706		seq_printf(m, "owner_prog_type:\t%u\n", type);
    707		seq_printf(m, "owner_jited:\t%u\n", jited);
    708	}
    709}
    710#endif
    711
    712static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
    713			      loff_t *ppos)
    714{
    715	/* We need this handler such that alloc_file() enables
    716	 * f_mode with FMODE_CAN_READ.
    717	 */
    718	return -EINVAL;
    719}
    720
    721static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
    722			       size_t siz, loff_t *ppos)
    723{
    724	/* We need this handler such that alloc_file() enables
    725	 * f_mode with FMODE_CAN_WRITE.
    726	 */
    727	return -EINVAL;
    728}
    729
    730/* called for any extra memory-mapped regions (except initial) */
    731static void bpf_map_mmap_open(struct vm_area_struct *vma)
    732{
    733	struct bpf_map *map = vma->vm_file->private_data;
    734
    735	if (vma->vm_flags & VM_MAYWRITE)
    736		bpf_map_write_active_inc(map);
    737}
    738
     739/* called for all unmapped memory regions (including the initial one) */
    740static void bpf_map_mmap_close(struct vm_area_struct *vma)
    741{
    742	struct bpf_map *map = vma->vm_file->private_data;
    743
    744	if (vma->vm_flags & VM_MAYWRITE)
    745		bpf_map_write_active_dec(map);
    746}
    747
    748static const struct vm_operations_struct bpf_map_default_vmops = {
    749	.open		= bpf_map_mmap_open,
    750	.close		= bpf_map_mmap_close,
    751};
    752
    753static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
    754{
    755	struct bpf_map *map = filp->private_data;
    756	int err;
    757
    758	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
    759	    map_value_has_timer(map) || map_value_has_kptrs(map))
    760		return -ENOTSUPP;
    761
    762	if (!(vma->vm_flags & VM_SHARED))
    763		return -EINVAL;
    764
    765	mutex_lock(&map->freeze_mutex);
    766
    767	if (vma->vm_flags & VM_WRITE) {
    768		if (map->frozen) {
    769			err = -EPERM;
    770			goto out;
    771		}
    772		/* map is meant to be read-only, so do not allow mapping as
    773		 * writable, because it's possible to leak a writable page
     774		 * reference and allow user-space to still modify it after
     775		 * freezing, while the verifier assumes the contents do not change
    776		 */
    777		if (map->map_flags & BPF_F_RDONLY_PROG) {
    778			err = -EACCES;
    779			goto out;
    780		}
    781	}
    782
    783	/* set default open/close callbacks */
    784	vma->vm_ops = &bpf_map_default_vmops;
    785	vma->vm_private_data = map;
    786	vma->vm_flags &= ~VM_MAYEXEC;
    787	if (!(vma->vm_flags & VM_WRITE))
    788		/* disallow re-mapping with PROT_WRITE */
    789		vma->vm_flags &= ~VM_MAYWRITE;
    790
    791	err = map->ops->map_mmap(map, vma);
    792	if (err)
    793		goto out;
    794
    795	if (vma->vm_flags & VM_MAYWRITE)
    796		bpf_map_write_active_inc(map);
    797out:
    798	mutex_unlock(&map->freeze_mutex);
    799	return err;
    800}
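        /* Note: a PROT_WRITE mapping keeps the map's writecnt elevated for its
         * whole lifetime (via the vm_ops open/close hooks above), which is what
         * makes a later BPF_MAP_FREEZE fail with -EBUSY; conversely, once a map
         * is frozen, new writable mappings are rejected here with -EPERM.
         */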
    801
    802static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
    803{
    804	struct bpf_map *map = filp->private_data;
    805
    806	if (map->ops->map_poll)
    807		return map->ops->map_poll(map, filp, pts);
    808
    809	return EPOLLERR;
    810}
    811
    812const struct file_operations bpf_map_fops = {
    813#ifdef CONFIG_PROC_FS
    814	.show_fdinfo	= bpf_map_show_fdinfo,
    815#endif
    816	.release	= bpf_map_release,
    817	.read		= bpf_dummy_read,
    818	.write		= bpf_dummy_write,
    819	.mmap		= bpf_map_mmap,
    820	.poll		= bpf_map_poll,
    821};
    822
    823int bpf_map_new_fd(struct bpf_map *map, int flags)
    824{
    825	int ret;
    826
    827	ret = security_bpf_map(map, OPEN_FMODE(flags));
    828	if (ret < 0)
    829		return ret;
    830
    831	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
    832				flags | O_CLOEXEC);
    833}
    834
    835int bpf_get_file_flag(int flags)
    836{
    837	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
    838		return -EINVAL;
    839	if (flags & BPF_F_RDONLY)
    840		return O_RDONLY;
    841	if (flags & BPF_F_WRONLY)
    842		return O_WRONLY;
    843	return O_RDWR;
    844}
    845
    846/* helper macro to check that unused fields 'union bpf_attr' are zero */
    847#define CHECK_ATTR(CMD) \
    848	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
    849		   sizeof(attr->CMD##_LAST_FIELD), 0, \
    850		   sizeof(*attr) - \
    851		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
    852		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
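        /* Example: BPF_MAP_FREEZE's last used field is map_fd, so
         * CHECK_ATTR(BPF_MAP_FREEZE) runs memchr_inv() over every byte of
         * 'union bpf_attr' that follows attr->map_fd and evaluates to true
         * (i.e. reject with -EINVAL) if any of those bytes is non-zero.
         */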
    853
    854/* dst and src must have at least "size" number of bytes.
    855 * Return strlen on success and < 0 on error.
    856 */
    857int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
    858{
    859	const char *end = src + size;
    860	const char *orig_src = src;
    861
    862	memset(dst, 0, size);
    863	/* Copy all isalnum(), '_' and '.' chars. */
    864	while (src < end && *src) {
    865		if (!isalnum(*src) &&
    866		    *src != '_' && *src != '.')
    867			return -EINVAL;
    868		*dst++ = *src++;
    869	}
    870
    871	/* No '\0' found in "size" number of bytes */
    872	if (src == end)
    873		return -EINVAL;
    874
    875	return src - orig_src;
    876}
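        /* Example: "my_map.v2" is copied verbatim and its length is returned,
         * "my map" fails with -EINVAL because ' ' is neither alphanumeric, '_'
         * nor '.', and a name that fills all "size" bytes without a
         * terminating '\0' also fails with -EINVAL.
         */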
    877
    878int map_check_no_btf(const struct bpf_map *map,
    879		     const struct btf *btf,
    880		     const struct btf_type *key_type,
    881		     const struct btf_type *value_type)
    882{
    883	return -ENOTSUPP;
    884}
    885
    886static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
    887{
    888	const u32 a = *(const u32 *)_a;
    889	const u32 b = *(const u32 *)_b;
    890
    891	if (a < b)
    892		return -1;
    893	else if (a > b)
    894		return 1;
    895	return 0;
    896}
    897
    898static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
    899{
    900	struct bpf_map *map = (struct bpf_map *)priv;
    901	u32 *off_base = map->off_arr->field_off;
    902	u32 *a = _a, *b = _b;
    903	u8 *sz_a, *sz_b;
    904
    905	sz_a = map->off_arr->field_sz + (a - off_base);
    906	sz_b = map->off_arr->field_sz + (b - off_base);
    907
    908	swap(*a, *b);
    909	swap(*sz_a, *sz_b);
    910}
    911
    912static int bpf_map_alloc_off_arr(struct bpf_map *map)
    913{
    914	bool has_spin_lock = map_value_has_spin_lock(map);
    915	bool has_timer = map_value_has_timer(map);
    916	bool has_kptrs = map_value_has_kptrs(map);
    917	struct bpf_map_off_arr *off_arr;
    918	u32 i;
    919
    920	if (!has_spin_lock && !has_timer && !has_kptrs) {
    921		map->off_arr = NULL;
    922		return 0;
    923	}
    924
    925	off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
    926	if (!off_arr)
    927		return -ENOMEM;
    928	map->off_arr = off_arr;
    929
    930	off_arr->cnt = 0;
    931	if (has_spin_lock) {
    932		i = off_arr->cnt;
    933
    934		off_arr->field_off[i] = map->spin_lock_off;
    935		off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
    936		off_arr->cnt++;
    937	}
    938	if (has_timer) {
    939		i = off_arr->cnt;
    940
    941		off_arr->field_off[i] = map->timer_off;
    942		off_arr->field_sz[i] = sizeof(struct bpf_timer);
    943		off_arr->cnt++;
    944	}
    945	if (has_kptrs) {
    946		struct bpf_map_value_off *tab = map->kptr_off_tab;
    947		u32 *off = &off_arr->field_off[off_arr->cnt];
    948		u8 *sz = &off_arr->field_sz[off_arr->cnt];
    949
    950		for (i = 0; i < tab->nr_off; i++) {
    951			*off++ = tab->off[i].offset;
    952			*sz++ = sizeof(u64);
    953		}
    954		off_arr->cnt += tab->nr_off;
    955	}
    956
    957	if (off_arr->cnt == 1)
    958		return 0;
    959	sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
    960	       map_off_arr_cmp, map_off_arr_swap, map);
    961	return 0;
    962}
    963
    964static int map_check_btf(struct bpf_map *map, const struct btf *btf,
    965			 u32 btf_key_id, u32 btf_value_id)
    966{
    967	const struct btf_type *key_type, *value_type;
    968	u32 key_size, value_size;
    969	int ret = 0;
    970
    971	/* Some maps allow key to be unspecified. */
    972	if (btf_key_id) {
    973		key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
    974		if (!key_type || key_size != map->key_size)
    975			return -EINVAL;
    976	} else {
    977		key_type = btf_type_by_id(btf, 0);
    978		if (!map->ops->map_check_btf)
    979			return -EINVAL;
    980	}
    981
    982	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
    983	if (!value_type || value_size != map->value_size)
    984		return -EINVAL;
    985
    986	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
    987
    988	if (map_value_has_spin_lock(map)) {
    989		if (map->map_flags & BPF_F_RDONLY_PROG)
    990			return -EACCES;
    991		if (map->map_type != BPF_MAP_TYPE_HASH &&
    992		    map->map_type != BPF_MAP_TYPE_ARRAY &&
    993		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
    994		    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
    995		    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
    996		    map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
    997			return -ENOTSUPP;
    998		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
    999		    map->value_size) {
   1000			WARN_ONCE(1,
   1001				  "verifier bug spin_lock_off %d value_size %d\n",
   1002				  map->spin_lock_off, map->value_size);
   1003			return -EFAULT;
   1004		}
   1005	}
   1006
   1007	map->timer_off = btf_find_timer(btf, value_type);
   1008	if (map_value_has_timer(map)) {
   1009		if (map->map_flags & BPF_F_RDONLY_PROG)
   1010			return -EACCES;
   1011		if (map->map_type != BPF_MAP_TYPE_HASH &&
   1012		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
   1013		    map->map_type != BPF_MAP_TYPE_ARRAY)
   1014			return -EOPNOTSUPP;
   1015	}
   1016
   1017	map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
   1018	if (map_value_has_kptrs(map)) {
   1019		if (!bpf_capable()) {
   1020			ret = -EPERM;
   1021			goto free_map_tab;
   1022		}
   1023		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
   1024			ret = -EACCES;
   1025			goto free_map_tab;
   1026		}
   1027		if (map->map_type != BPF_MAP_TYPE_HASH &&
   1028		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
   1029		    map->map_type != BPF_MAP_TYPE_ARRAY) {
   1030			ret = -EOPNOTSUPP;
   1031			goto free_map_tab;
   1032		}
   1033	}
   1034
   1035	if (map->ops->map_check_btf) {
   1036		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
   1037		if (ret < 0)
   1038			goto free_map_tab;
   1039	}
   1040
   1041	return ret;
   1042free_map_tab:
   1043	bpf_map_free_kptr_off_tab(map);
   1044	return ret;
   1045}
   1046
   1047#define BPF_MAP_CREATE_LAST_FIELD map_extra
   1048/* called via syscall */
   1049static int map_create(union bpf_attr *attr)
   1050{
   1051	int numa_node = bpf_map_attr_numa_node(attr);
   1052	struct bpf_map *map;
   1053	int f_flags;
   1054	int err;
   1055
   1056	err = CHECK_ATTR(BPF_MAP_CREATE);
   1057	if (err)
   1058		return -EINVAL;
   1059
   1060	if (attr->btf_vmlinux_value_type_id) {
   1061		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
   1062		    attr->btf_key_type_id || attr->btf_value_type_id)
   1063			return -EINVAL;
   1064	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
   1065		return -EINVAL;
   1066	}
   1067
   1068	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
   1069	    attr->map_extra != 0)
   1070		return -EINVAL;
   1071
   1072	f_flags = bpf_get_file_flag(attr->map_flags);
   1073	if (f_flags < 0)
   1074		return f_flags;
   1075
   1076	if (numa_node != NUMA_NO_NODE &&
   1077	    ((unsigned int)numa_node >= nr_node_ids ||
   1078	     !node_online(numa_node)))
   1079		return -EINVAL;
   1080
   1081	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
   1082	map = find_and_alloc_map(attr);
   1083	if (IS_ERR(map))
   1084		return PTR_ERR(map);
   1085
   1086	err = bpf_obj_name_cpy(map->name, attr->map_name,
   1087			       sizeof(attr->map_name));
   1088	if (err < 0)
   1089		goto free_map;
   1090
   1091	atomic64_set(&map->refcnt, 1);
   1092	atomic64_set(&map->usercnt, 1);
   1093	mutex_init(&map->freeze_mutex);
   1094	spin_lock_init(&map->owner.lock);
   1095
   1096	map->spin_lock_off = -EINVAL;
   1097	map->timer_off = -EINVAL;
   1098	if (attr->btf_key_type_id || attr->btf_value_type_id ||
    1099	    /* Even if the map's value is a kernel struct,
    1100	     * the bpf_prog.o must have BTF to begin with
    1101	     * to figure out the corresponding kernel
    1102	     * counterpart.  Thus, attr->btf_fd also has
    1103	     * to be valid.
    1104	     */
   1105	    attr->btf_vmlinux_value_type_id) {
   1106		struct btf *btf;
   1107
   1108		btf = btf_get_by_fd(attr->btf_fd);
   1109		if (IS_ERR(btf)) {
   1110			err = PTR_ERR(btf);
   1111			goto free_map;
   1112		}
   1113		if (btf_is_kernel(btf)) {
   1114			btf_put(btf);
   1115			err = -EACCES;
   1116			goto free_map;
   1117		}
   1118		map->btf = btf;
   1119
   1120		if (attr->btf_value_type_id) {
   1121			err = map_check_btf(map, btf, attr->btf_key_type_id,
   1122					    attr->btf_value_type_id);
   1123			if (err)
   1124				goto free_map;
   1125		}
   1126
   1127		map->btf_key_type_id = attr->btf_key_type_id;
   1128		map->btf_value_type_id = attr->btf_value_type_id;
   1129		map->btf_vmlinux_value_type_id =
   1130			attr->btf_vmlinux_value_type_id;
   1131	}
   1132
   1133	err = bpf_map_alloc_off_arr(map);
   1134	if (err)
   1135		goto free_map;
   1136
   1137	err = security_bpf_map_alloc(map);
   1138	if (err)
   1139		goto free_map_off_arr;
   1140
   1141	err = bpf_map_alloc_id(map);
   1142	if (err)
   1143		goto free_map_sec;
   1144
   1145	bpf_map_save_memcg(map);
   1146
   1147	err = bpf_map_new_fd(map, f_flags);
   1148	if (err < 0) {
   1149		/* failed to allocate fd.
   1150		 * bpf_map_put_with_uref() is needed because the above
   1151		 * bpf_map_alloc_id() has published the map
    1152		 * to userspace, and userspace may
   1153		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
   1154		 */
   1155		bpf_map_put_with_uref(map);
   1156		return err;
   1157	}
   1158
   1159	return err;
   1160
   1161free_map_sec:
   1162	security_bpf_map_free(map);
   1163free_map_off_arr:
   1164	kfree(map->off_arr);
   1165free_map:
   1166	btf_put(map->btf);
   1167	map->ops->map_free(map);
   1168	return err;
   1169}
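        /* Example (illustrative userspace view): a minimal BPF_MAP_CREATE
         * request fills only the fixed-size fields, e.g.
         *
         *	union bpf_attr attr = {
         *		.map_type    = BPF_MAP_TYPE_ARRAY,
         *		.key_size    = 4,	// array maps require 4-byte keys
         *		.value_size  = 8,
         *		.max_entries = 16,
         *	};
         *
         * with every unused byte of the union left zero, since CHECK_ATTR()
         * above would otherwise reject the command with -EINVAL.
         */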
   1170
    1171/* if an error is returned, the fd is released.
    1172 * On success the caller should complete fd access with a matching fdput().
   1173 */
   1174struct bpf_map *__bpf_map_get(struct fd f)
   1175{
   1176	if (!f.file)
   1177		return ERR_PTR(-EBADF);
   1178	if (f.file->f_op != &bpf_map_fops) {
   1179		fdput(f);
   1180		return ERR_PTR(-EINVAL);
   1181	}
   1182
   1183	return f.file->private_data;
   1184}
   1185
   1186void bpf_map_inc(struct bpf_map *map)
   1187{
   1188	atomic64_inc(&map->refcnt);
   1189}
   1190EXPORT_SYMBOL_GPL(bpf_map_inc);
   1191
   1192void bpf_map_inc_with_uref(struct bpf_map *map)
   1193{
   1194	atomic64_inc(&map->refcnt);
   1195	atomic64_inc(&map->usercnt);
   1196}
   1197EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
   1198
   1199struct bpf_map *bpf_map_get(u32 ufd)
   1200{
   1201	struct fd f = fdget(ufd);
   1202	struct bpf_map *map;
   1203
   1204	map = __bpf_map_get(f);
   1205	if (IS_ERR(map))
   1206		return map;
   1207
   1208	bpf_map_inc(map);
   1209	fdput(f);
   1210
   1211	return map;
   1212}
   1213EXPORT_SYMBOL(bpf_map_get);
   1214
   1215struct bpf_map *bpf_map_get_with_uref(u32 ufd)
   1216{
   1217	struct fd f = fdget(ufd);
   1218	struct bpf_map *map;
   1219
   1220	map = __bpf_map_get(f);
   1221	if (IS_ERR(map))
   1222		return map;
   1223
   1224	bpf_map_inc_with_uref(map);
   1225	fdput(f);
   1226
   1227	return map;
   1228}
   1229
    1230/* map_idr_lock must be held by the caller */
   1231static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
   1232{
   1233	int refold;
   1234
   1235	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
   1236	if (!refold)
   1237		return ERR_PTR(-ENOENT);
   1238	if (uref)
   1239		atomic64_inc(&map->usercnt);
   1240
   1241	return map;
   1242}
   1243
   1244struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
   1245{
   1246	spin_lock_bh(&map_idr_lock);
   1247	map = __bpf_map_inc_not_zero(map, false);
   1248	spin_unlock_bh(&map_idr_lock);
   1249
   1250	return map;
   1251}
   1252EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
   1253
   1254int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
   1255{
   1256	return -ENOTSUPP;
   1257}
   1258
   1259static void *__bpf_copy_key(void __user *ukey, u64 key_size)
   1260{
   1261	if (key_size)
   1262		return vmemdup_user(ukey, key_size);
   1263
   1264	if (ukey)
   1265		return ERR_PTR(-EINVAL);
   1266
   1267	return NULL;
   1268}
   1269
   1270static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
   1271{
   1272	if (key_size)
   1273		return kvmemdup_bpfptr(ukey, key_size);
   1274
   1275	if (!bpfptr_is_null(ukey))
   1276		return ERR_PTR(-EINVAL);
   1277
   1278	return NULL;
   1279}
   1280
   1281/* last field in 'union bpf_attr' used by this command */
   1282#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
   1283
   1284static int map_lookup_elem(union bpf_attr *attr)
   1285{
   1286	void __user *ukey = u64_to_user_ptr(attr->key);
   1287	void __user *uvalue = u64_to_user_ptr(attr->value);
   1288	int ufd = attr->map_fd;
   1289	struct bpf_map *map;
   1290	void *key, *value;
   1291	u32 value_size;
   1292	struct fd f;
   1293	int err;
   1294
   1295	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
   1296		return -EINVAL;
   1297
   1298	if (attr->flags & ~BPF_F_LOCK)
   1299		return -EINVAL;
   1300
   1301	f = fdget(ufd);
   1302	map = __bpf_map_get(f);
   1303	if (IS_ERR(map))
   1304		return PTR_ERR(map);
   1305	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
   1306		err = -EPERM;
   1307		goto err_put;
   1308	}
   1309
   1310	if ((attr->flags & BPF_F_LOCK) &&
   1311	    !map_value_has_spin_lock(map)) {
   1312		err = -EINVAL;
   1313		goto err_put;
   1314	}
   1315
   1316	key = __bpf_copy_key(ukey, map->key_size);
   1317	if (IS_ERR(key)) {
   1318		err = PTR_ERR(key);
   1319		goto err_put;
   1320	}
   1321
   1322	value_size = bpf_map_value_size(map);
   1323
   1324	err = -ENOMEM;
   1325	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
   1326	if (!value)
   1327		goto free_key;
   1328
   1329	if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
   1330		if (copy_from_user(value, uvalue, value_size))
   1331			err = -EFAULT;
   1332		else
   1333			err = bpf_map_copy_value(map, key, value, attr->flags);
   1334		goto free_value;
   1335	}
   1336
   1337	err = bpf_map_copy_value(map, key, value, attr->flags);
   1338	if (err)
   1339		goto free_value;
   1340
   1341	err = -EFAULT;
   1342	if (copy_to_user(uvalue, value, value_size) != 0)
   1343		goto free_value;
   1344
   1345	err = 0;
   1346
   1347free_value:
   1348	kvfree(value);
   1349free_key:
   1350	kvfree(key);
   1351err_put:
   1352	fdput(f);
   1353	return err;
   1354}
   1355
   1356
   1357#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
   1358
   1359static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
   1360{
   1361	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
   1362	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
   1363	int ufd = attr->map_fd;
   1364	struct bpf_map *map;
   1365	void *key, *value;
   1366	u32 value_size;
   1367	struct fd f;
   1368	int err;
   1369
   1370	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
   1371		return -EINVAL;
   1372
   1373	f = fdget(ufd);
   1374	map = __bpf_map_get(f);
   1375	if (IS_ERR(map))
   1376		return PTR_ERR(map);
   1377	bpf_map_write_active_inc(map);
   1378	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
   1379		err = -EPERM;
   1380		goto err_put;
   1381	}
   1382
   1383	if ((attr->flags & BPF_F_LOCK) &&
   1384	    !map_value_has_spin_lock(map)) {
   1385		err = -EINVAL;
   1386		goto err_put;
   1387	}
   1388
   1389	key = ___bpf_copy_key(ukey, map->key_size);
   1390	if (IS_ERR(key)) {
   1391		err = PTR_ERR(key);
   1392		goto err_put;
   1393	}
   1394
   1395	value_size = bpf_map_value_size(map);
   1396
   1397	err = -ENOMEM;
   1398	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
   1399	if (!value)
   1400		goto free_key;
   1401
   1402	err = -EFAULT;
   1403	if (copy_from_bpfptr(value, uvalue, value_size) != 0)
   1404		goto free_value;
   1405
   1406	err = bpf_map_update_value(map, f, key, value, attr->flags);
   1407
   1408free_value:
   1409	kvfree(value);
   1410free_key:
   1411	kvfree(key);
   1412err_put:
   1413	bpf_map_write_active_dec(map);
   1414	fdput(f);
   1415	return err;
   1416}
   1417
   1418#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
   1419
   1420static int map_delete_elem(union bpf_attr *attr)
   1421{
   1422	void __user *ukey = u64_to_user_ptr(attr->key);
   1423	int ufd = attr->map_fd;
   1424	struct bpf_map *map;
   1425	struct fd f;
   1426	void *key;
   1427	int err;
   1428
   1429	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
   1430		return -EINVAL;
   1431
   1432	f = fdget(ufd);
   1433	map = __bpf_map_get(f);
   1434	if (IS_ERR(map))
   1435		return PTR_ERR(map);
   1436	bpf_map_write_active_inc(map);
   1437	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
   1438		err = -EPERM;
   1439		goto err_put;
   1440	}
   1441
   1442	key = __bpf_copy_key(ukey, map->key_size);
   1443	if (IS_ERR(key)) {
   1444		err = PTR_ERR(key);
   1445		goto err_put;
   1446	}
   1447
   1448	if (bpf_map_is_dev_bound(map)) {
   1449		err = bpf_map_offload_delete_elem(map, key);
   1450		goto out;
   1451	} else if (IS_FD_PROG_ARRAY(map) ||
   1452		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
   1453		/* These maps require sleepable context */
   1454		err = map->ops->map_delete_elem(map, key);
   1455		goto out;
   1456	}
   1457
   1458	bpf_disable_instrumentation();
   1459	rcu_read_lock();
   1460	err = map->ops->map_delete_elem(map, key);
   1461	rcu_read_unlock();
   1462	bpf_enable_instrumentation();
   1463	maybe_wait_bpf_programs(map);
   1464out:
   1465	kvfree(key);
   1466err_put:
   1467	bpf_map_write_active_dec(map);
   1468	fdput(f);
   1469	return err;
   1470}
   1471
   1472/* last field in 'union bpf_attr' used by this command */
   1473#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
   1474
   1475static int map_get_next_key(union bpf_attr *attr)
   1476{
   1477	void __user *ukey = u64_to_user_ptr(attr->key);
   1478	void __user *unext_key = u64_to_user_ptr(attr->next_key);
   1479	int ufd = attr->map_fd;
   1480	struct bpf_map *map;
   1481	void *key, *next_key;
   1482	struct fd f;
   1483	int err;
   1484
   1485	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
   1486		return -EINVAL;
   1487
   1488	f = fdget(ufd);
   1489	map = __bpf_map_get(f);
   1490	if (IS_ERR(map))
   1491		return PTR_ERR(map);
   1492	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
   1493		err = -EPERM;
   1494		goto err_put;
   1495	}
   1496
   1497	if (ukey) {
   1498		key = __bpf_copy_key(ukey, map->key_size);
   1499		if (IS_ERR(key)) {
   1500			err = PTR_ERR(key);
   1501			goto err_put;
   1502		}
   1503	} else {
   1504		key = NULL;
   1505	}
   1506
   1507	err = -ENOMEM;
   1508	next_key = kvmalloc(map->key_size, GFP_USER);
   1509	if (!next_key)
   1510		goto free_key;
   1511
   1512	if (bpf_map_is_dev_bound(map)) {
   1513		err = bpf_map_offload_get_next_key(map, key, next_key);
   1514		goto out;
   1515	}
   1516
   1517	rcu_read_lock();
   1518	err = map->ops->map_get_next_key(map, key, next_key);
   1519	rcu_read_unlock();
   1520out:
   1521	if (err)
   1522		goto free_next_key;
   1523
   1524	err = -EFAULT;
   1525	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
   1526		goto free_next_key;
   1527
   1528	err = 0;
   1529
   1530free_next_key:
   1531	kvfree(next_key);
   1532free_key:
   1533	kvfree(key);
   1534err_put:
   1535	fdput(f);
   1536	return err;
   1537}
   1538
   1539int generic_map_delete_batch(struct bpf_map *map,
   1540			     const union bpf_attr *attr,
   1541			     union bpf_attr __user *uattr)
   1542{
   1543	void __user *keys = u64_to_user_ptr(attr->batch.keys);
   1544	u32 cp, max_count;
   1545	int err = 0;
   1546	void *key;
   1547
   1548	if (attr->batch.elem_flags & ~BPF_F_LOCK)
   1549		return -EINVAL;
   1550
   1551	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
   1552	    !map_value_has_spin_lock(map)) {
   1553		return -EINVAL;
   1554	}
   1555
   1556	max_count = attr->batch.count;
   1557	if (!max_count)
   1558		return 0;
   1559
   1560	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
   1561	if (!key)
   1562		return -ENOMEM;
   1563
   1564	for (cp = 0; cp < max_count; cp++) {
   1565		err = -EFAULT;
   1566		if (copy_from_user(key, keys + cp * map->key_size,
   1567				   map->key_size))
   1568			break;
   1569
   1570		if (bpf_map_is_dev_bound(map)) {
   1571			err = bpf_map_offload_delete_elem(map, key);
   1572			break;
   1573		}
   1574
   1575		bpf_disable_instrumentation();
   1576		rcu_read_lock();
   1577		err = map->ops->map_delete_elem(map, key);
   1578		rcu_read_unlock();
   1579		bpf_enable_instrumentation();
   1580		if (err)
   1581			break;
   1582		cond_resched();
   1583	}
   1584	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
   1585		err = -EFAULT;
   1586
   1587	kvfree(key);
   1588
   1589	maybe_wait_bpf_programs(map);
   1590	return err;
   1591}
   1592
   1593int generic_map_update_batch(struct bpf_map *map,
   1594			     const union bpf_attr *attr,
   1595			     union bpf_attr __user *uattr)
   1596{
   1597	void __user *values = u64_to_user_ptr(attr->batch.values);
   1598	void __user *keys = u64_to_user_ptr(attr->batch.keys);
   1599	u32 value_size, cp, max_count;
   1600	int ufd = attr->batch.map_fd;
   1601	void *key, *value;
   1602	struct fd f;
   1603	int err = 0;
   1604
   1605	if (attr->batch.elem_flags & ~BPF_F_LOCK)
   1606		return -EINVAL;
   1607
   1608	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
   1609	    !map_value_has_spin_lock(map)) {
   1610		return -EINVAL;
   1611	}
   1612
   1613	value_size = bpf_map_value_size(map);
   1614
   1615	max_count = attr->batch.count;
   1616	if (!max_count)
   1617		return 0;
   1618
   1619	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
   1620	if (!key)
   1621		return -ENOMEM;
   1622
   1623	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
   1624	if (!value) {
   1625		kvfree(key);
   1626		return -ENOMEM;
   1627	}
   1628
   1629	f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
   1630	for (cp = 0; cp < max_count; cp++) {
   1631		err = -EFAULT;
   1632		if (copy_from_user(key, keys + cp * map->key_size,
   1633		    map->key_size) ||
   1634		    copy_from_user(value, values + cp * value_size, value_size))
   1635			break;
   1636
   1637		err = bpf_map_update_value(map, f, key, value,
   1638					   attr->batch.elem_flags);
   1639
   1640		if (err)
   1641			break;
   1642		cond_resched();
   1643	}
   1644
   1645	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
   1646		err = -EFAULT;
   1647
   1648	kvfree(value);
   1649	kvfree(key);
   1650	fdput(f);
   1651	return err;
   1652}
   1653
   1654#define MAP_LOOKUP_RETRIES 3
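        /* An element returned by map_get_next_key() can be deleted by a
         * concurrent writer before generic_map_lookup_batch() below manages to
         * copy its value, in which case the lookup sees -ENOENT and retries
         * from the same position; after MAP_LOOKUP_RETRIES consecutive misses
         * the batch gives up with -EINTR so that userspace can restart it.
         */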
   1655
   1656int generic_map_lookup_batch(struct bpf_map *map,
   1657				    const union bpf_attr *attr,
   1658				    union bpf_attr __user *uattr)
   1659{
   1660	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
   1661	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
   1662	void __user *values = u64_to_user_ptr(attr->batch.values);
   1663	void __user *keys = u64_to_user_ptr(attr->batch.keys);
   1664	void *buf, *buf_prevkey, *prev_key, *key, *value;
   1665	int err, retry = MAP_LOOKUP_RETRIES;
   1666	u32 value_size, cp, max_count;
   1667
   1668	if (attr->batch.elem_flags & ~BPF_F_LOCK)
   1669		return -EINVAL;
   1670
   1671	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
   1672	    !map_value_has_spin_lock(map))
   1673		return -EINVAL;
   1674
   1675	value_size = bpf_map_value_size(map);
   1676
   1677	max_count = attr->batch.count;
   1678	if (!max_count)
   1679		return 0;
   1680
   1681	if (put_user(0, &uattr->batch.count))
   1682		return -EFAULT;
   1683
   1684	buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
   1685	if (!buf_prevkey)
   1686		return -ENOMEM;
   1687
   1688	buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
   1689	if (!buf) {
   1690		kvfree(buf_prevkey);
   1691		return -ENOMEM;
   1692	}
   1693
   1694	err = -EFAULT;
   1695	prev_key = NULL;
   1696	if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
   1697		goto free_buf;
   1698	key = buf;
   1699	value = key + map->key_size;
   1700	if (ubatch)
   1701		prev_key = buf_prevkey;
   1702
   1703	for (cp = 0; cp < max_count;) {
   1704		rcu_read_lock();
   1705		err = map->ops->map_get_next_key(map, prev_key, key);
   1706		rcu_read_unlock();
   1707		if (err)
   1708			break;
   1709		err = bpf_map_copy_value(map, key, value,
   1710					 attr->batch.elem_flags);
   1711
   1712		if (err == -ENOENT) {
   1713			if (retry) {
   1714				retry--;
   1715				continue;
   1716			}
   1717			err = -EINTR;
   1718			break;
   1719		}
   1720
   1721		if (err)
   1722			goto free_buf;
   1723
   1724		if (copy_to_user(keys + cp * map->key_size, key,
   1725				 map->key_size)) {
   1726			err = -EFAULT;
   1727			goto free_buf;
   1728		}
   1729		if (copy_to_user(values + cp * value_size, value, value_size)) {
   1730			err = -EFAULT;
   1731			goto free_buf;
   1732		}
   1733
   1734		if (!prev_key)
   1735			prev_key = buf_prevkey;
   1736
   1737		swap(prev_key, key);
   1738		retry = MAP_LOOKUP_RETRIES;
   1739		cp++;
   1740		cond_resched();
   1741	}
   1742
   1743	if (err == -EFAULT)
   1744		goto free_buf;
   1745
   1746	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
   1747		    (cp && copy_to_user(uobatch, prev_key, map->key_size))))
   1748		err = -EFAULT;
   1749
   1750free_buf:
   1751	kvfree(buf_prevkey);
   1752	kvfree(buf);
   1753	return err;
   1754}
   1755
   1756#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
   1757
   1758static int map_lookup_and_delete_elem(union bpf_attr *attr)
   1759{
   1760	void __user *ukey = u64_to_user_ptr(attr->key);
   1761	void __user *uvalue = u64_to_user_ptr(attr->value);
   1762	int ufd = attr->map_fd;
   1763	struct bpf_map *map;
   1764	void *key, *value;
   1765	u32 value_size;
   1766	struct fd f;
   1767	int err;
   1768
   1769	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
   1770		return -EINVAL;
   1771
   1772	if (attr->flags & ~BPF_F_LOCK)
   1773		return -EINVAL;
   1774
   1775	f = fdget(ufd);
   1776	map = __bpf_map_get(f);
   1777	if (IS_ERR(map))
   1778		return PTR_ERR(map);
   1779	bpf_map_write_active_inc(map);
   1780	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
   1781	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
   1782		err = -EPERM;
   1783		goto err_put;
   1784	}
   1785
   1786	if (attr->flags &&
   1787	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
   1788	     map->map_type == BPF_MAP_TYPE_STACK)) {
   1789		err = -EINVAL;
   1790		goto err_put;
   1791	}
   1792
   1793	if ((attr->flags & BPF_F_LOCK) &&
   1794	    !map_value_has_spin_lock(map)) {
   1795		err = -EINVAL;
   1796		goto err_put;
   1797	}
   1798
   1799	key = __bpf_copy_key(ukey, map->key_size);
   1800	if (IS_ERR(key)) {
   1801		err = PTR_ERR(key);
   1802		goto err_put;
   1803	}
   1804
   1805	value_size = bpf_map_value_size(map);
   1806
   1807	err = -ENOMEM;
   1808	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
   1809	if (!value)
   1810		goto free_key;
   1811
   1812	err = -ENOTSUPP;
   1813	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
   1814	    map->map_type == BPF_MAP_TYPE_STACK) {
   1815		err = map->ops->map_pop_elem(map, value);
   1816	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
   1817		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
   1818		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
   1819		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
   1820		if (!bpf_map_is_dev_bound(map)) {
   1821			bpf_disable_instrumentation();
   1822			rcu_read_lock();
   1823			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
   1824			rcu_read_unlock();
   1825			bpf_enable_instrumentation();
   1826		}
   1827	}
   1828
   1829	if (err)
   1830		goto free_value;
   1831
   1832	if (copy_to_user(uvalue, value, value_size) != 0) {
   1833		err = -EFAULT;
   1834		goto free_value;
   1835	}
   1836
   1837	err = 0;
   1838
   1839free_value:
   1840	kvfree(value);
   1841free_key:
   1842	kvfree(key);
   1843err_put:
   1844	bpf_map_write_active_dec(map);
   1845	fdput(f);
   1846	return err;
   1847}
   1848
   1849#define BPF_MAP_FREEZE_LAST_FIELD map_fd
   1850
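/* BPF_MAP_FREEZE: make the map read-only from the syscall side. Fails with
 * -EBUSY while syscall-side writes are still in flight or if the map is
 * already frozen, and requires bpf_capable().
 */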
   1851static int map_freeze(const union bpf_attr *attr)
   1852{
   1853	int err = 0, ufd = attr->map_fd;
   1854	struct bpf_map *map;
   1855	struct fd f;
   1856
   1857	if (CHECK_ATTR(BPF_MAP_FREEZE))
   1858		return -EINVAL;
   1859
   1860	f = fdget(ufd);
   1861	map = __bpf_map_get(f);
   1862	if (IS_ERR(map))
   1863		return PTR_ERR(map);
   1864
   1865	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
   1866	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
   1867		fdput(f);
   1868		return -ENOTSUPP;
   1869	}
   1870
   1871	mutex_lock(&map->freeze_mutex);
   1872	if (bpf_map_write_active(map)) {
   1873		err = -EBUSY;
   1874		goto err_put;
   1875	}
   1876	if (READ_ONCE(map->frozen)) {
   1877		err = -EBUSY;
   1878		goto err_put;
   1879	}
   1880	if (!bpf_capable()) {
   1881		err = -EPERM;
   1882		goto err_put;
   1883	}
   1884
   1885	WRITE_ONCE(map->frozen, true);
   1886err_put:
   1887	mutex_unlock(&map->freeze_mutex);
   1888	fdput(f);
   1889	return err;
   1890}
   1891
   1892static const struct bpf_prog_ops * const bpf_prog_types[] = {
   1893#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
   1894	[_id] = & _name ## _prog_ops,
   1895#define BPF_MAP_TYPE(_id, _ops)
   1896#define BPF_LINK_TYPE(_id, _name)
   1897#include <linux/bpf_types.h>
   1898#undef BPF_PROG_TYPE
   1899#undef BPF_MAP_TYPE
   1900#undef BPF_LINK_TYPE
   1901};
   1902
   1903static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
   1904{
   1905	const struct bpf_prog_ops *ops;
   1906
   1907	if (type >= ARRAY_SIZE(bpf_prog_types))
   1908		return -EINVAL;
   1909	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
   1910	ops = bpf_prog_types[type];
   1911	if (!ops)
   1912		return -EINVAL;
   1913
   1914	if (!bpf_prog_is_dev_bound(prog->aux))
   1915		prog->aux->ops = ops;
   1916	else
   1917		prog->aux->ops = &bpf_offload_prog_ops;
   1918	prog->type = type;
   1919	return 0;
   1920}
   1921
   1922enum bpf_audit {
   1923	BPF_AUDIT_LOAD,
   1924	BPF_AUDIT_UNLOAD,
   1925	BPF_AUDIT_MAX,
   1926};
   1927
   1928static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
   1929	[BPF_AUDIT_LOAD]   = "LOAD",
   1930	[BPF_AUDIT_UNLOAD] = "UNLOAD",
   1931};
   1932
   1933static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
   1934{
   1935	struct audit_context *ctx = NULL;
   1936	struct audit_buffer *ab;
   1937
   1938	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
   1939		return;
   1940	if (audit_enabled == AUDIT_OFF)
   1941		return;
   1942	if (op == BPF_AUDIT_LOAD)
   1943		ctx = audit_context();
   1944	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
   1945	if (unlikely(!ab))
   1946		return;
   1947	audit_log_format(ab, "prog-id=%u op=%s",
   1948			 prog->aux->id, bpf_audit_str[op]);
   1949	audit_log_end(ab);
   1950}
   1951
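/* Allocate a user-visible program ID from prog_idr. IDs are handed out
 * cyclically in [1, INT_MAX) so that freed IDs are not immediately reused.
 */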
   1952static int bpf_prog_alloc_id(struct bpf_prog *prog)
   1953{
   1954	int id;
   1955
   1956	idr_preload(GFP_KERNEL);
   1957	spin_lock_bh(&prog_idr_lock);
   1958	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
   1959	if (id > 0)
   1960		prog->aux->id = id;
   1961	spin_unlock_bh(&prog_idr_lock);
   1962	idr_preload_end();
   1963
   1964	/* id is in [1, INT_MAX) */
   1965	if (WARN_ON_ONCE(!id))
   1966		return -ENOSPC;
   1967
   1968	return id > 0 ? 0 : id;
   1969}
   1970
   1971void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
   1972{
   1973	unsigned long flags;
   1974
   1975	/* cBPF to eBPF migrations are currently not in the idr store.
   1976	 * Offloaded programs are removed from the store when their device
   1977	 * disappears - even if someone grabs an fd to them they are unusable,
   1978	 * simply waiting for refcnt to drop to be freed.
   1979	 */
   1980	if (!prog->aux->id)
   1981		return;
   1982
   1983	if (do_idr_lock)
   1984		spin_lock_irqsave(&prog_idr_lock, flags);
   1985	else
   1986		__acquire(&prog_idr_lock);
   1987
   1988	idr_remove(&prog_idr, prog->aux->id);
   1989	prog->aux->id = 0;
   1990
   1991	if (do_idr_lock)
   1992		spin_unlock_irqrestore(&prog_idr_lock, flags);
   1993	else
   1994		__release(&prog_idr_lock);
   1995}
   1996
   1997static void __bpf_prog_put_rcu(struct rcu_head *rcu)
   1998{
   1999	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
   2000
   2001	kvfree(aux->func_info);
   2002	kfree(aux->func_info_aux);
   2003	free_uid(aux->user);
   2004	security_bpf_prog_free(aux);
   2005	bpf_prog_free(aux->prog);
   2006}
   2007
   2008static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
   2009{
   2010	bpf_prog_kallsyms_del_all(prog);
   2011	btf_put(prog->aux->btf);
   2012	kvfree(prog->aux->jited_linfo);
   2013	kvfree(prog->aux->linfo);
   2014	kfree(prog->aux->kfunc_tab);
   2015	if (prog->aux->attach_btf)
   2016		btf_put(prog->aux->attach_btf);
   2017
   2018	if (deferred) {
   2019		if (prog->aux->sleepable)
   2020			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
   2021		else
   2022			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
   2023	} else {
   2024		__bpf_prog_put_rcu(&prog->aux->rcu);
   2025	}
   2026}
   2027
   2028static void bpf_prog_put_deferred(struct work_struct *work)
   2029{
   2030	struct bpf_prog_aux *aux;
   2031	struct bpf_prog *prog;
   2032
   2033	aux = container_of(work, struct bpf_prog_aux, work);
   2034	prog = aux->prog;
   2035	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
   2036	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
   2037	__bpf_prog_put_noref(prog, true);
   2038}
   2039
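/* Drop one reference on @prog. On the final put the ID is released first and
 * the remaining teardown is pushed to a workqueue when running in IRQ context
 * or with IRQs disabled; otherwise it runs inline.
 */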
   2040static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
   2041{
   2042	struct bpf_prog_aux *aux = prog->aux;
   2043
   2044	if (atomic64_dec_and_test(&aux->refcnt)) {
   2045		/* bpf_prog_free_id() must be called first */
   2046		bpf_prog_free_id(prog, do_idr_lock);
   2047
   2048		if (in_irq() || irqs_disabled()) {
   2049			INIT_WORK(&aux->work, bpf_prog_put_deferred);
   2050			schedule_work(&aux->work);
   2051		} else {
   2052			bpf_prog_put_deferred(&aux->work);
   2053		}
   2054	}
   2055}
   2056
   2057void bpf_prog_put(struct bpf_prog *prog)
   2058{
   2059	__bpf_prog_put(prog, true);
   2060}
   2061EXPORT_SYMBOL_GPL(bpf_prog_put);
   2062
   2063static int bpf_prog_release(struct inode *inode, struct file *filp)
   2064{
   2065	struct bpf_prog *prog = filp->private_data;
   2066
   2067	bpf_prog_put(prog);
   2068	return 0;
   2069}
   2070
   2071struct bpf_prog_kstats {
   2072	u64 nsecs;
   2073	u64 cnt;
   2074	u64 misses;
   2075};
   2076
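/* Fold the per-CPU run-time counters into a single struct bpf_prog_kstats,
 * using the u64_stats seqcount to read a consistent snapshot from each CPU.
 */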
   2077static void bpf_prog_get_stats(const struct bpf_prog *prog,
   2078			       struct bpf_prog_kstats *stats)
   2079{
   2080	u64 nsecs = 0, cnt = 0, misses = 0;
   2081	int cpu;
   2082
   2083	for_each_possible_cpu(cpu) {
   2084		const struct bpf_prog_stats *st;
   2085		unsigned int start;
   2086		u64 tnsecs, tcnt, tmisses;
   2087
   2088		st = per_cpu_ptr(prog->stats, cpu);
   2089		do {
   2090			start = u64_stats_fetch_begin_irq(&st->syncp);
   2091			tnsecs = u64_stats_read(&st->nsecs);
   2092			tcnt = u64_stats_read(&st->cnt);
   2093			tmisses = u64_stats_read(&st->misses);
   2094		} while (u64_stats_fetch_retry_irq(&st->syncp, start));
   2095		nsecs += tnsecs;
   2096		cnt += tcnt;
   2097		misses += tmisses;
   2098	}
   2099	stats->nsecs = nsecs;
   2100	stats->cnt = cnt;
   2101	stats->misses = misses;
   2102}
   2103
   2104#ifdef CONFIG_PROC_FS
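/* Emit /proc/<pid>/fdinfo/<fd> for a program fd. An illustrative (made-up)
 * example of the resulting output:
 *
 *	prog_type:	1
 *	prog_jited:	1
 *	prog_tag:	a04f5eef06a7f555
 *	memlock:	4096
 *	prog_id:	42
 *	run_time_ns:	0
 *	run_cnt:	0
 *	recursion_misses:	0
 *	verified_insns:	2
 */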
   2105static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
   2106{
   2107	const struct bpf_prog *prog = filp->private_data;
   2108	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
   2109	struct bpf_prog_kstats stats;
   2110
   2111	bpf_prog_get_stats(prog, &stats);
   2112	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
   2113	seq_printf(m,
   2114		   "prog_type:\t%u\n"
   2115		   "prog_jited:\t%u\n"
   2116		   "prog_tag:\t%s\n"
   2117		   "memlock:\t%llu\n"
   2118		   "prog_id:\t%u\n"
   2119		   "run_time_ns:\t%llu\n"
   2120		   "run_cnt:\t%llu\n"
   2121		   "recursion_misses:\t%llu\n"
   2122		   "verified_insns:\t%u\n",
   2123		   prog->type,
   2124		   prog->jited,
   2125		   prog_tag,
   2126		   prog->pages * 1ULL << PAGE_SHIFT,
   2127		   prog->aux->id,
   2128		   stats.nsecs,
   2129		   stats.cnt,
   2130		   stats.misses,
   2131		   prog->aux->verified_insns);
   2132}
   2133#endif
   2134
   2135const struct file_operations bpf_prog_fops = {
   2136#ifdef CONFIG_PROC_FS
   2137	.show_fdinfo	= bpf_prog_show_fdinfo,
   2138#endif
   2139	.release	= bpf_prog_release,
   2140	.read		= bpf_dummy_read,
   2141	.write		= bpf_dummy_write,
   2142};
   2143
   2144int bpf_prog_new_fd(struct bpf_prog *prog)
   2145{
   2146	int ret;
   2147
   2148	ret = security_bpf_prog(prog);
   2149	if (ret < 0)
   2150		return ret;
   2151
   2152	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
   2153				O_RDWR | O_CLOEXEC);
   2154}
   2155
   2156static struct bpf_prog *____bpf_prog_get(struct fd f)
   2157{
   2158	if (!f.file)
   2159		return ERR_PTR(-EBADF);
   2160	if (f.file->f_op != &bpf_prog_fops) {
   2161		fdput(f);
   2162		return ERR_PTR(-EINVAL);
   2163	}
   2164
   2165	return f.file->private_data;
   2166}
   2167
   2168void bpf_prog_add(struct bpf_prog *prog, int i)
   2169{
   2170	atomic64_add(i, &prog->aux->refcnt);
   2171}
   2172EXPORT_SYMBOL_GPL(bpf_prog_add);
   2173
   2174void bpf_prog_sub(struct bpf_prog *prog, int i)
   2175{
   2176	/* Only to be used for undoing previous bpf_prog_add() in some
   2177	 * error path. We still know that another entity in our call
   2178	 * path holds a reference to the program, thus atomic_sub() can
   2179	 * be safely used in such cases!
   2180	 */
   2181	WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
   2182}
   2183EXPORT_SYMBOL_GPL(bpf_prog_sub);
   2184
   2185void bpf_prog_inc(struct bpf_prog *prog)
   2186{
   2187	atomic64_inc(&prog->aux->refcnt);
   2188}
   2189EXPORT_SYMBOL_GPL(bpf_prog_inc);
   2190
   2191/* prog_idr_lock should have been held */
   2192struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
   2193{
   2194	int refold;
   2195
   2196	refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
   2197
   2198	if (!refold)
   2199		return ERR_PTR(-ENOENT);
   2200
   2201	return prog;
   2202}
   2203EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
   2204
   2205bool bpf_prog_get_ok(struct bpf_prog *prog,
   2206			    enum bpf_prog_type *attach_type, bool attach_drv)
   2207{
   2208	/* not an attachment, just a refcount inc, always allow */
   2209	if (!attach_type)
   2210		return true;
   2211
   2212	if (prog->type != *attach_type)
   2213		return false;
   2214	if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
   2215		return false;
   2216
   2217	return true;
   2218}
   2219
   2220static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
   2221				       bool attach_drv)
   2222{
   2223	struct fd f = fdget(ufd);
   2224	struct bpf_prog *prog;
   2225
   2226	prog = ____bpf_prog_get(f);
   2227	if (IS_ERR(prog))
   2228		return prog;
   2229	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
   2230		prog = ERR_PTR(-EINVAL);
   2231		goto out;
   2232	}
   2233
   2234	bpf_prog_inc(prog);
   2235out:
   2236	fdput(f);
   2237	return prog;
   2238}
   2239
   2240struct bpf_prog *bpf_prog_get(u32 ufd)
   2241{
   2242	return __bpf_prog_get(ufd, NULL, false);
   2243}
   2244
   2245struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
   2246				       bool attach_drv)
   2247{
   2248	return __bpf_prog_get(ufd, &type, attach_drv);
   2249}
   2250EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
   2251
   2252/* Initially all BPF programs could be loaded w/o specifying
   2253 * expected_attach_type. Later for some of them specifying expected_attach_type
   2254 * at load time became required so that program could be validated properly.
   2255 * Programs of types that are allowed to be loaded both w/ and w/o (for
   2256 * backward compatibility) expected_attach_type, should have the default attach
   2257 * type assigned to expected_attach_type for the latter case, so that it can be
   2258 * validated later at attach time.
   2259 *
   2260 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
   2261 * prog type requires it but has some attach types that have to be backward
   2262 * compatible.
   2263 */
   2264static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
   2265{
   2266	switch (attr->prog_type) {
   2267	case BPF_PROG_TYPE_CGROUP_SOCK:
   2268		/* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
   2269		 * exist so checking for non-zero is the way to go here.
   2270		 */
   2271		if (!attr->expected_attach_type)
   2272			attr->expected_attach_type =
   2273				BPF_CGROUP_INET_SOCK_CREATE;
   2274		break;
   2275	case BPF_PROG_TYPE_SK_REUSEPORT:
   2276		if (!attr->expected_attach_type)
   2277			attr->expected_attach_type =
   2278				BPF_SK_REUSEPORT_SELECT;
   2279		break;
   2280	}
   2281}
   2282
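/* Validate the attach-target triple supplied at load time - expected attach
 * type, attach BTF id and destination prog - against what the program type
 * permits.
 */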
   2283static int
   2284bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
   2285			   enum bpf_attach_type expected_attach_type,
   2286			   struct btf *attach_btf, u32 btf_id,
   2287			   struct bpf_prog *dst_prog)
   2288{
   2289	if (btf_id) {
   2290		if (btf_id > BTF_MAX_TYPE)
   2291			return -EINVAL;
   2292
   2293		if (!attach_btf && !dst_prog)
   2294			return -EINVAL;
   2295
   2296		switch (prog_type) {
   2297		case BPF_PROG_TYPE_TRACING:
   2298		case BPF_PROG_TYPE_LSM:
   2299		case BPF_PROG_TYPE_STRUCT_OPS:
   2300		case BPF_PROG_TYPE_EXT:
   2301			break;
   2302		default:
   2303			return -EINVAL;
   2304		}
   2305	}
   2306
   2307	if (attach_btf && (!btf_id || dst_prog))
   2308		return -EINVAL;
   2309
   2310	if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
   2311	    prog_type != BPF_PROG_TYPE_EXT)
   2312		return -EINVAL;
   2313
   2314	switch (prog_type) {
   2315	case BPF_PROG_TYPE_CGROUP_SOCK:
   2316		switch (expected_attach_type) {
   2317		case BPF_CGROUP_INET_SOCK_CREATE:
   2318		case BPF_CGROUP_INET_SOCK_RELEASE:
   2319		case BPF_CGROUP_INET4_POST_BIND:
   2320		case BPF_CGROUP_INET6_POST_BIND:
   2321			return 0;
   2322		default:
   2323			return -EINVAL;
   2324		}
   2325	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   2326		switch (expected_attach_type) {
   2327		case BPF_CGROUP_INET4_BIND:
   2328		case BPF_CGROUP_INET6_BIND:
   2329		case BPF_CGROUP_INET4_CONNECT:
   2330		case BPF_CGROUP_INET6_CONNECT:
   2331		case BPF_CGROUP_INET4_GETPEERNAME:
   2332		case BPF_CGROUP_INET6_GETPEERNAME:
   2333		case BPF_CGROUP_INET4_GETSOCKNAME:
   2334		case BPF_CGROUP_INET6_GETSOCKNAME:
   2335		case BPF_CGROUP_UDP4_SENDMSG:
   2336		case BPF_CGROUP_UDP6_SENDMSG:
   2337		case BPF_CGROUP_UDP4_RECVMSG:
   2338		case BPF_CGROUP_UDP6_RECVMSG:
   2339			return 0;
   2340		default:
   2341			return -EINVAL;
   2342		}
   2343	case BPF_PROG_TYPE_CGROUP_SKB:
   2344		switch (expected_attach_type) {
   2345		case BPF_CGROUP_INET_INGRESS:
   2346		case BPF_CGROUP_INET_EGRESS:
   2347			return 0;
   2348		default:
   2349			return -EINVAL;
   2350		}
   2351	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   2352		switch (expected_attach_type) {
   2353		case BPF_CGROUP_SETSOCKOPT:
   2354		case BPF_CGROUP_GETSOCKOPT:
   2355			return 0;
   2356		default:
   2357			return -EINVAL;
   2358		}
   2359	case BPF_PROG_TYPE_SK_LOOKUP:
   2360		if (expected_attach_type == BPF_SK_LOOKUP)
   2361			return 0;
   2362		return -EINVAL;
   2363	case BPF_PROG_TYPE_SK_REUSEPORT:
   2364		switch (expected_attach_type) {
   2365		case BPF_SK_REUSEPORT_SELECT:
   2366		case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
   2367			return 0;
   2368		default:
   2369			return -EINVAL;
   2370		}
   2371	case BPF_PROG_TYPE_SYSCALL:
   2372	case BPF_PROG_TYPE_EXT:
   2373		if (expected_attach_type)
   2374			return -EINVAL;
   2375		fallthrough;
   2376	default:
   2377		return 0;
   2378	}
   2379}
   2380
   2381static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
   2382{
   2383	switch (prog_type) {
   2384	case BPF_PROG_TYPE_SCHED_CLS:
   2385	case BPF_PROG_TYPE_SCHED_ACT:
   2386	case BPF_PROG_TYPE_XDP:
   2387	case BPF_PROG_TYPE_LWT_IN:
   2388	case BPF_PROG_TYPE_LWT_OUT:
   2389	case BPF_PROG_TYPE_LWT_XMIT:
   2390	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
   2391	case BPF_PROG_TYPE_SK_SKB:
   2392	case BPF_PROG_TYPE_SK_MSG:
   2393	case BPF_PROG_TYPE_LIRC_MODE2:
   2394	case BPF_PROG_TYPE_FLOW_DISSECTOR:
   2395	case BPF_PROG_TYPE_CGROUP_DEVICE:
   2396	case BPF_PROG_TYPE_CGROUP_SOCK:
   2397	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   2398	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   2399	case BPF_PROG_TYPE_CGROUP_SYSCTL:
   2400	case BPF_PROG_TYPE_SOCK_OPS:
   2401	case BPF_PROG_TYPE_EXT: /* extends any prog */
   2402		return true;
   2403	case BPF_PROG_TYPE_CGROUP_SKB:
   2404		/* always unpriv */
   2405	case BPF_PROG_TYPE_SK_REUSEPORT:
   2406		/* equivalent to SOCKET_FILTER. need CAP_BPF only */
   2407	default:
   2408		return false;
   2409	}
   2410}
   2411
   2412static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
   2413{
   2414	switch (prog_type) {
   2415	case BPF_PROG_TYPE_KPROBE:
   2416	case BPF_PROG_TYPE_TRACEPOINT:
   2417	case BPF_PROG_TYPE_PERF_EVENT:
   2418	case BPF_PROG_TYPE_RAW_TRACEPOINT:
   2419	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
   2420	case BPF_PROG_TYPE_TRACING:
   2421	case BPF_PROG_TYPE_LSM:
   2422	case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
   2423	case BPF_PROG_TYPE_EXT: /* extends any prog */
   2424		return true;
   2425	default:
   2426		return false;
   2427	}
   2428}
   2429
   2430/* last field in 'union bpf_attr' used by this command */
   2431#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
   2432
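/* BPF_PROG_LOAD: check flags, capabilities and license, copy the instructions
 * from user space, resolve the attach target (another program's fd or a BTF
 * id), run the verifier, select/JIT the runtime, allocate an ID and return a
 * new program fd to user space.
 */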
   2433static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
   2434{
   2435	enum bpf_prog_type type = attr->prog_type;
   2436	struct bpf_prog *prog, *dst_prog = NULL;
   2437	struct btf *attach_btf = NULL;
   2438	int err;
   2439	char license[128];
   2440	bool is_gpl;
   2441
   2442	if (CHECK_ATTR(BPF_PROG_LOAD))
   2443		return -EINVAL;
   2444
   2445	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
   2446				 BPF_F_ANY_ALIGNMENT |
   2447				 BPF_F_TEST_STATE_FREQ |
   2448				 BPF_F_SLEEPABLE |
   2449				 BPF_F_TEST_RND_HI32 |
   2450				 BPF_F_XDP_HAS_FRAGS))
   2451		return -EINVAL;
   2452
   2453	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
   2454	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
   2455	    !bpf_capable())
   2456		return -EPERM;
   2457
   2458	/* copy eBPF program license from user space */
   2459	if (strncpy_from_bpfptr(license,
   2460				make_bpfptr(attr->license, uattr.is_kernel),
   2461				sizeof(license) - 1) < 0)
   2462		return -EFAULT;
   2463	license[sizeof(license) - 1] = 0;
   2464
   2465	/* eBPF programs must be GPL compatible to use GPL-ed functions */
   2466	is_gpl = license_is_gpl_compatible(license);
   2467
   2468	if (attr->insn_cnt == 0 ||
   2469	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
   2470		return -E2BIG;
   2471	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
   2472	    type != BPF_PROG_TYPE_CGROUP_SKB &&
   2473	    !bpf_capable())
   2474		return -EPERM;
   2475
   2476	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
   2477		return -EPERM;
   2478	if (is_perfmon_prog_type(type) && !perfmon_capable())
   2479		return -EPERM;
   2480
   2481	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
   2482	 * or btf, we need to check which one it is
   2483	 */
   2484	if (attr->attach_prog_fd) {
   2485		dst_prog = bpf_prog_get(attr->attach_prog_fd);
   2486		if (IS_ERR(dst_prog)) {
   2487			dst_prog = NULL;
   2488			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
   2489			if (IS_ERR(attach_btf))
   2490				return -EINVAL;
   2491			if (!btf_is_kernel(attach_btf)) {
   2492				/* attaching through specifying bpf_prog's BTF
   2493				 * objects directly might be supported eventually
   2494				 */
   2495				btf_put(attach_btf);
   2496				return -ENOTSUPP;
   2497			}
   2498		}
   2499	} else if (attr->attach_btf_id) {
   2500		/* fall back to vmlinux BTF, if BTF type ID is specified */
   2501		attach_btf = bpf_get_btf_vmlinux();
   2502		if (IS_ERR(attach_btf))
   2503			return PTR_ERR(attach_btf);
   2504		if (!attach_btf)
   2505			return -EINVAL;
   2506		btf_get(attach_btf);
   2507	}
   2508
   2509	bpf_prog_load_fixup_attach_type(attr);
   2510	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
   2511				       attach_btf, attr->attach_btf_id,
   2512				       dst_prog)) {
   2513		if (dst_prog)
   2514			bpf_prog_put(dst_prog);
   2515		if (attach_btf)
   2516			btf_put(attach_btf);
   2517		return -EINVAL;
   2518	}
   2519
   2520	/* plain bpf_prog allocation */
   2521	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
   2522	if (!prog) {
   2523		if (dst_prog)
   2524			bpf_prog_put(dst_prog);
   2525		if (attach_btf)
   2526			btf_put(attach_btf);
   2527		return -ENOMEM;
   2528	}
   2529
   2530	prog->expected_attach_type = attr->expected_attach_type;
   2531	prog->aux->attach_btf = attach_btf;
   2532	prog->aux->attach_btf_id = attr->attach_btf_id;
   2533	prog->aux->dst_prog = dst_prog;
   2534	prog->aux->offload_requested = !!attr->prog_ifindex;
   2535	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
   2536	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
   2537
   2538	err = security_bpf_prog_alloc(prog->aux);
   2539	if (err)
   2540		goto free_prog;
   2541
   2542	prog->aux->user = get_current_user();
   2543	prog->len = attr->insn_cnt;
   2544
   2545	err = -EFAULT;
   2546	if (copy_from_bpfptr(prog->insns,
   2547			     make_bpfptr(attr->insns, uattr.is_kernel),
   2548			     bpf_prog_insn_size(prog)) != 0)
   2549		goto free_prog_sec;
   2550
   2551	prog->orig_prog = NULL;
   2552	prog->jited = 0;
   2553
   2554	atomic64_set(&prog->aux->refcnt, 1);
   2555	prog->gpl_compatible = is_gpl ? 1 : 0;
   2556
   2557	if (bpf_prog_is_dev_bound(prog->aux)) {
   2558		err = bpf_prog_offload_init(prog, attr);
   2559		if (err)
   2560			goto free_prog_sec;
   2561	}
   2562
   2563	/* find program type: socket_filter vs tracing_filter */
   2564	err = find_prog_type(type, prog);
   2565	if (err < 0)
   2566		goto free_prog_sec;
   2567
   2568	prog->aux->load_time = ktime_get_boottime_ns();
   2569	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
   2570			       sizeof(attr->prog_name));
   2571	if (err < 0)
   2572		goto free_prog_sec;
   2573
   2574	/* run eBPF verifier */
   2575	err = bpf_check(&prog, attr, uattr);
   2576	if (err < 0)
   2577		goto free_used_maps;
   2578
   2579	prog = bpf_prog_select_runtime(prog, &err);
   2580	if (err < 0)
   2581		goto free_used_maps;
   2582
   2583	err = bpf_prog_alloc_id(prog);
   2584	if (err)
   2585		goto free_used_maps;
   2586
   2587	/* Upon success of bpf_prog_alloc_id(), the BPF prog is
   2588	 * effectively publicly exposed. However, retrieving via
   2589	 * bpf_prog_get_fd_by_id() will take another reference,
   2590	 * therefore it cannot be gone underneath us.
   2591	 *
   2592	 * Only for the time /after/ successful bpf_prog_new_fd()
   2593	 * and before returning to userspace, we might just hold
   2594	 * one reference and any parallel close on that fd could
   2595	 * rip everything out. Hence, below notifications must
   2596	 * happen before bpf_prog_new_fd().
   2597	 *
   2598	 * Also, any failure handling from this point onwards must
   2599	 * be using bpf_prog_put() given the program is exposed.
   2600	 */
   2601	bpf_prog_kallsyms_add(prog);
   2602	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
   2603	bpf_audit_prog(prog, BPF_AUDIT_LOAD);
   2604
   2605	err = bpf_prog_new_fd(prog);
   2606	if (err < 0)
   2607		bpf_prog_put(prog);
   2608	return err;
   2609
   2610free_used_maps:
   2611	/* In case we have subprogs, we need to wait for a grace
   2612	 * period before we can tear down JIT memory since symbols
   2613	 * are already exposed under kallsyms.
   2614	 */
   2615	__bpf_prog_put_noref(prog, prog->aux->func_cnt);
   2616	return err;
   2617free_prog_sec:
   2618	free_uid(prog->aux->user);
   2619	security_bpf_prog_free(prog->aux);
   2620free_prog:
   2621	if (prog->aux->attach_btf)
   2622		btf_put(prog->aux->attach_btf);
   2623	bpf_prog_free(prog);
   2624	return err;
   2625}
   2626
   2627#define BPF_OBJ_LAST_FIELD file_flags
   2628
   2629static int bpf_obj_pin(const union bpf_attr *attr)
   2630{
   2631	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
   2632		return -EINVAL;
   2633
   2634	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
   2635}
   2636
   2637static int bpf_obj_get(const union bpf_attr *attr)
   2638{
   2639	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
   2640	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
   2641		return -EINVAL;
   2642
   2643	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
   2644				attr->file_flags);
   2645}
   2646
   2647void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
   2648		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
   2649{
   2650	atomic64_set(&link->refcnt, 1);
   2651	link->type = type;
   2652	link->id = 0;
   2653	link->ops = ops;
   2654	link->prog = prog;
   2655}
   2656
   2657static void bpf_link_free_id(int id)
   2658{
   2659	if (!id)
   2660		return;
   2661
   2662	spin_lock_bh(&link_idr_lock);
   2663	idr_remove(&link_idr, id);
   2664	spin_unlock_bh(&link_idr_lock);
   2665}
   2666
   2667/* Clean up bpf_link and corresponding anon_inode file and FD. After
   2668 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
    2669 * anon_inode's release() call. This helper marks bpf_link as
    2670 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
    2671 * is not decremented, it's the responsibility of the calling code that failed
   2672 * to complete bpf_link initialization.
   2673 */
   2674void bpf_link_cleanup(struct bpf_link_primer *primer)
   2675{
   2676	primer->link->prog = NULL;
   2677	bpf_link_free_id(primer->id);
   2678	fput(primer->file);
   2679	put_unused_fd(primer->fd);
   2680}
   2681
   2682void bpf_link_inc(struct bpf_link *link)
   2683{
   2684	atomic64_inc(&link->refcnt);
   2685}
   2686
   2687/* bpf_link_free is guaranteed to be called from process context */
   2688static void bpf_link_free(struct bpf_link *link)
   2689{
   2690	bpf_link_free_id(link->id);
   2691	if (link->prog) {
   2692		/* detach BPF program, clean up used resources */
   2693		link->ops->release(link);
   2694		bpf_prog_put(link->prog);
   2695	}
   2696	/* free bpf_link and its containing memory */
   2697	link->ops->dealloc(link);
   2698}
   2699
   2700static void bpf_link_put_deferred(struct work_struct *work)
   2701{
   2702	struct bpf_link *link = container_of(work, struct bpf_link, work);
   2703
   2704	bpf_link_free(link);
   2705}
   2706
   2707/* bpf_link_put can be called from atomic context, but ensures that resources
   2708 * are freed from process context
   2709 */
   2710void bpf_link_put(struct bpf_link *link)
   2711{
   2712	if (!atomic64_dec_and_test(&link->refcnt))
   2713		return;
   2714
   2715	if (in_atomic()) {
   2716		INIT_WORK(&link->work, bpf_link_put_deferred);
   2717		schedule_work(&link->work);
   2718	} else {
   2719		bpf_link_free(link);
   2720	}
   2721}
   2722EXPORT_SYMBOL(bpf_link_put);
   2723
   2724static int bpf_link_release(struct inode *inode, struct file *filp)
   2725{
   2726	struct bpf_link *link = filp->private_data;
   2727
   2728	bpf_link_put(link);
   2729	return 0;
   2730}
   2731
   2732#ifdef CONFIG_PROC_FS
   2733#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
   2734#define BPF_MAP_TYPE(_id, _ops)
   2735#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
   2736static const char *bpf_link_type_strs[] = {
   2737	[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
   2738#include <linux/bpf_types.h>
   2739};
   2740#undef BPF_PROG_TYPE
   2741#undef BPF_MAP_TYPE
   2742#undef BPF_LINK_TYPE
   2743
   2744static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
   2745{
   2746	const struct bpf_link *link = filp->private_data;
   2747	const struct bpf_prog *prog = link->prog;
   2748	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
   2749
   2750	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
   2751	seq_printf(m,
   2752		   "link_type:\t%s\n"
   2753		   "link_id:\t%u\n"
   2754		   "prog_tag:\t%s\n"
   2755		   "prog_id:\t%u\n",
   2756		   bpf_link_type_strs[link->type],
   2757		   link->id,
   2758		   prog_tag,
   2759		   prog->aux->id);
   2760	if (link->ops->show_fdinfo)
   2761		link->ops->show_fdinfo(link, m);
   2762}
   2763#endif
   2764
   2765static const struct file_operations bpf_link_fops = {
   2766#ifdef CONFIG_PROC_FS
   2767	.show_fdinfo	= bpf_link_show_fdinfo,
   2768#endif
   2769	.release	= bpf_link_release,
   2770	.read		= bpf_dummy_read,
   2771	.write		= bpf_dummy_write,
   2772};
   2773
   2774static int bpf_link_alloc_id(struct bpf_link *link)
   2775{
   2776	int id;
   2777
   2778	idr_preload(GFP_KERNEL);
   2779	spin_lock_bh(&link_idr_lock);
   2780	id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
   2781	spin_unlock_bh(&link_idr_lock);
   2782	idr_preload_end();
   2783
   2784	return id;
   2785}
   2786
   2787/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
   2788 * reserving unused FD and allocating ID from link_idr. This is to be paired
   2789 * with bpf_link_settle() to install FD and ID and expose bpf_link to
   2790 * user-space, if bpf_link is successfully attached. If not, bpf_link and
    2791 * pre-allocated resources are to be freed with bpf_link_cleanup() call. All the
   2792 * transient state is passed around in struct bpf_link_primer.
    2793 * This is the preferred way to create and initialize bpf_link, especially when
   2794 * there are complicated and expensive operations in between creating bpf_link
   2795 * itself and attaching it to BPF hook. By using bpf_link_prime() and
   2796 * bpf_link_settle() kernel code using bpf_link doesn't have to perform
   2797 * expensive (and potentially failing) roll back operations in a rare case
   2798 * that file, FD, or ID can't be allocated.
   2799 */
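/* Illustrative sketch of the prime/settle pattern as used by the link types
 * in this file (error paths trimmed; attach_to_hook() is a placeholder for
 * the hook-specific registration step, cf. bpf_raw_tp_link_attach()):
 *
 *	err = bpf_link_prime(&my_link->link, &link_primer);
 *	if (err) {
 *		kfree(my_link);
 *		return err;
 *	}
 *	err = attach_to_hook(my_link);
 *	if (err) {
 *		bpf_link_cleanup(&link_primer);
 *		return err;
 *	}
 *	return bpf_link_settle(&link_primer);
 */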
   2800int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
   2801{
   2802	struct file *file;
   2803	int fd, id;
   2804
   2805	fd = get_unused_fd_flags(O_CLOEXEC);
   2806	if (fd < 0)
   2807		return fd;
   2808
   2809
   2810	id = bpf_link_alloc_id(link);
   2811	if (id < 0) {
   2812		put_unused_fd(fd);
   2813		return id;
   2814	}
   2815
   2816	file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
   2817	if (IS_ERR(file)) {
   2818		bpf_link_free_id(id);
   2819		put_unused_fd(fd);
   2820		return PTR_ERR(file);
   2821	}
   2822
   2823	primer->link = link;
   2824	primer->file = file;
   2825	primer->fd = fd;
   2826	primer->id = id;
   2827	return 0;
   2828}
   2829
   2830int bpf_link_settle(struct bpf_link_primer *primer)
   2831{
   2832	/* make bpf_link fetchable by ID */
   2833	spin_lock_bh(&link_idr_lock);
   2834	primer->link->id = primer->id;
   2835	spin_unlock_bh(&link_idr_lock);
   2836	/* make bpf_link fetchable by FD */
   2837	fd_install(primer->fd, primer->file);
   2838	/* pass through installed FD */
   2839	return primer->fd;
   2840}
   2841
   2842int bpf_link_new_fd(struct bpf_link *link)
   2843{
   2844	return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
   2845}
   2846
   2847struct bpf_link *bpf_link_get_from_fd(u32 ufd)
   2848{
   2849	struct fd f = fdget(ufd);
   2850	struct bpf_link *link;
   2851
   2852	if (!f.file)
   2853		return ERR_PTR(-EBADF);
   2854	if (f.file->f_op != &bpf_link_fops) {
   2855		fdput(f);
   2856		return ERR_PTR(-EINVAL);
   2857	}
   2858
   2859	link = f.file->private_data;
   2860	bpf_link_inc(link);
   2861	fdput(f);
   2862
   2863	return link;
   2864}
   2865EXPORT_SYMBOL(bpf_link_get_from_fd);
   2866
   2867static void bpf_tracing_link_release(struct bpf_link *link)
   2868{
   2869	struct bpf_tracing_link *tr_link =
   2870		container_of(link, struct bpf_tracing_link, link.link);
   2871
   2872	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
   2873						tr_link->trampoline));
   2874
   2875	bpf_trampoline_put(tr_link->trampoline);
   2876
   2877	/* tgt_prog is NULL if target is a kernel function */
   2878	if (tr_link->tgt_prog)
   2879		bpf_prog_put(tr_link->tgt_prog);
   2880}
   2881
   2882static void bpf_tracing_link_dealloc(struct bpf_link *link)
   2883{
   2884	struct bpf_tracing_link *tr_link =
   2885		container_of(link, struct bpf_tracing_link, link.link);
   2886
   2887	kfree(tr_link);
   2888}
   2889
   2890static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
   2891					 struct seq_file *seq)
   2892{
   2893	struct bpf_tracing_link *tr_link =
   2894		container_of(link, struct bpf_tracing_link, link.link);
   2895
   2896	seq_printf(seq,
   2897		   "attach_type:\t%d\n",
   2898		   tr_link->attach_type);
   2899}
   2900
   2901static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
   2902					   struct bpf_link_info *info)
   2903{
   2904	struct bpf_tracing_link *tr_link =
   2905		container_of(link, struct bpf_tracing_link, link.link);
   2906
   2907	info->tracing.attach_type = tr_link->attach_type;
   2908	bpf_trampoline_unpack_key(tr_link->trampoline->key,
   2909				  &info->tracing.target_obj_id,
   2910				  &info->tracing.target_btf_id);
   2911
   2912	return 0;
   2913}
   2914
   2915static const struct bpf_link_ops bpf_tracing_link_lops = {
   2916	.release = bpf_tracing_link_release,
   2917	.dealloc = bpf_tracing_link_dealloc,
   2918	.show_fdinfo = bpf_tracing_link_show_fdinfo,
   2919	.fill_link_info = bpf_tracing_link_fill_link_info,
   2920};
   2921
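/* Attach a TRACING, EXT or LSM program through a bpf_link. The target is
 * either the BTF id/program recorded at load time or, for EXT programs, a
 * tgt_prog_fd + btf_id pair passed in here; the actual attachment goes
 * through a BPF trampoline.
 */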
   2922static int bpf_tracing_prog_attach(struct bpf_prog *prog,
   2923				   int tgt_prog_fd,
   2924				   u32 btf_id,
   2925				   u64 bpf_cookie)
   2926{
   2927	struct bpf_link_primer link_primer;
   2928	struct bpf_prog *tgt_prog = NULL;
   2929	struct bpf_trampoline *tr = NULL;
   2930	struct bpf_tracing_link *link;
   2931	u64 key = 0;
   2932	int err;
   2933
   2934	switch (prog->type) {
   2935	case BPF_PROG_TYPE_TRACING:
   2936		if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
   2937		    prog->expected_attach_type != BPF_TRACE_FEXIT &&
   2938		    prog->expected_attach_type != BPF_MODIFY_RETURN) {
   2939			err = -EINVAL;
   2940			goto out_put_prog;
   2941		}
   2942		break;
   2943	case BPF_PROG_TYPE_EXT:
   2944		if (prog->expected_attach_type != 0) {
   2945			err = -EINVAL;
   2946			goto out_put_prog;
   2947		}
   2948		break;
   2949	case BPF_PROG_TYPE_LSM:
   2950		if (prog->expected_attach_type != BPF_LSM_MAC) {
   2951			err = -EINVAL;
   2952			goto out_put_prog;
   2953		}
   2954		break;
   2955	default:
   2956		err = -EINVAL;
   2957		goto out_put_prog;
   2958	}
   2959
   2960	if (!!tgt_prog_fd != !!btf_id) {
   2961		err = -EINVAL;
   2962		goto out_put_prog;
   2963	}
   2964
   2965	if (tgt_prog_fd) {
   2966		/* For now we only allow new targets for BPF_PROG_TYPE_EXT */
   2967		if (prog->type != BPF_PROG_TYPE_EXT) {
   2968			err = -EINVAL;
   2969			goto out_put_prog;
   2970		}
   2971
   2972		tgt_prog = bpf_prog_get(tgt_prog_fd);
   2973		if (IS_ERR(tgt_prog)) {
   2974			err = PTR_ERR(tgt_prog);
   2975			tgt_prog = NULL;
   2976			goto out_put_prog;
   2977		}
   2978
   2979		key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
   2980	}
   2981
   2982	link = kzalloc(sizeof(*link), GFP_USER);
   2983	if (!link) {
   2984		err = -ENOMEM;
   2985		goto out_put_prog;
   2986	}
   2987	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
   2988		      &bpf_tracing_link_lops, prog);
   2989	link->attach_type = prog->expected_attach_type;
   2990	link->link.cookie = bpf_cookie;
   2991
   2992	mutex_lock(&prog->aux->dst_mutex);
   2993
   2994	/* There are a few possible cases here:
   2995	 *
   2996	 * - if prog->aux->dst_trampoline is set, the program was just loaded
   2997	 *   and not yet attached to anything, so we can use the values stored
   2998	 *   in prog->aux
   2999	 *
   3000	 * - if prog->aux->dst_trampoline is NULL, the program has already been
    3001	 *   attached to a target and its initial target was cleared (below)
   3002	 *
   3003	 * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
   3004	 *   target_btf_id using the link_create API.
   3005	 *
    3006	 * - if tgt_prog == NULL, this function was called using the old
    3007	 *   raw_tracepoint_open API and we need a target from prog->aux
   3008	 *
    3009	 * - if prog->aux->dst_trampoline and tgt_prog are both NULL, the program
   3010	 *   was detached and is going for re-attachment.
   3011	 */
   3012	if (!prog->aux->dst_trampoline && !tgt_prog) {
   3013		/*
   3014		 * Allow re-attach for TRACING and LSM programs. If it's
   3015		 * currently linked, bpf_trampoline_link_prog will fail.
   3016		 * EXT programs need to specify tgt_prog_fd, so they
    3017		 * re-attach in a separate code path.
   3018		 */
   3019		if (prog->type != BPF_PROG_TYPE_TRACING &&
   3020		    prog->type != BPF_PROG_TYPE_LSM) {
   3021			err = -EINVAL;
   3022			goto out_unlock;
   3023		}
   3024		btf_id = prog->aux->attach_btf_id;
   3025		key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
   3026	}
   3027
   3028	if (!prog->aux->dst_trampoline ||
   3029	    (key && key != prog->aux->dst_trampoline->key)) {
   3030		/* If there is no saved target, or the specified target is
   3031		 * different from the destination specified at load time, we
   3032		 * need a new trampoline and a check for compatibility
   3033		 */
   3034		struct bpf_attach_target_info tgt_info = {};
   3035
   3036		err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
   3037					      &tgt_info);
   3038		if (err)
   3039			goto out_unlock;
   3040
   3041		tr = bpf_trampoline_get(key, &tgt_info);
   3042		if (!tr) {
   3043			err = -ENOMEM;
   3044			goto out_unlock;
   3045		}
   3046	} else {
   3047		/* The caller didn't specify a target, or the target was the
   3048		 * same as the destination supplied during program load. This
   3049		 * means we can reuse the trampoline and reference from program
   3050		 * load time, and there is no need to allocate a new one. This
   3051		 * can only happen once for any program, as the saved values in
   3052		 * prog->aux are cleared below.
   3053		 */
   3054		tr = prog->aux->dst_trampoline;
   3055		tgt_prog = prog->aux->dst_prog;
   3056	}
   3057
   3058	err = bpf_link_prime(&link->link.link, &link_primer);
   3059	if (err)
   3060		goto out_unlock;
   3061
   3062	err = bpf_trampoline_link_prog(&link->link, tr);
   3063	if (err) {
   3064		bpf_link_cleanup(&link_primer);
   3065		link = NULL;
   3066		goto out_unlock;
   3067	}
   3068
   3069	link->tgt_prog = tgt_prog;
   3070	link->trampoline = tr;
   3071
   3072	/* Always clear the trampoline and target prog from prog->aux to make
   3073	 * sure the original attach destination is not kept alive after a
   3074	 * program is (re-)attached to another target.
   3075	 */
   3076	if (prog->aux->dst_prog &&
   3077	    (tgt_prog_fd || tr != prog->aux->dst_trampoline))
   3078		/* got extra prog ref from syscall, or attaching to different prog */
   3079		bpf_prog_put(prog->aux->dst_prog);
   3080	if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
   3081		/* we allocated a new trampoline, so free the old one */
   3082		bpf_trampoline_put(prog->aux->dst_trampoline);
   3083
   3084	prog->aux->dst_prog = NULL;
   3085	prog->aux->dst_trampoline = NULL;
   3086	mutex_unlock(&prog->aux->dst_mutex);
   3087
   3088	return bpf_link_settle(&link_primer);
   3089out_unlock:
   3090	if (tr && tr != prog->aux->dst_trampoline)
   3091		bpf_trampoline_put(tr);
   3092	mutex_unlock(&prog->aux->dst_mutex);
   3093	kfree(link);
   3094out_put_prog:
   3095	if (tgt_prog_fd && tgt_prog)
   3096		bpf_prog_put(tgt_prog);
   3097	return err;
   3098}
   3099
   3100struct bpf_raw_tp_link {
   3101	struct bpf_link link;
   3102	struct bpf_raw_event_map *btp;
   3103};
   3104
   3105static void bpf_raw_tp_link_release(struct bpf_link *link)
   3106{
   3107	struct bpf_raw_tp_link *raw_tp =
   3108		container_of(link, struct bpf_raw_tp_link, link);
   3109
   3110	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
   3111	bpf_put_raw_tracepoint(raw_tp->btp);
   3112}
   3113
   3114static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
   3115{
   3116	struct bpf_raw_tp_link *raw_tp =
   3117		container_of(link, struct bpf_raw_tp_link, link);
   3118
   3119	kfree(raw_tp);
   3120}
   3121
   3122static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
   3123					struct seq_file *seq)
   3124{
   3125	struct bpf_raw_tp_link *raw_tp_link =
   3126		container_of(link, struct bpf_raw_tp_link, link);
   3127
   3128	seq_printf(seq,
   3129		   "tp_name:\t%s\n",
   3130		   raw_tp_link->btp->tp->name);
   3131}
   3132
   3133static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
   3134					  struct bpf_link_info *info)
   3135{
   3136	struct bpf_raw_tp_link *raw_tp_link =
   3137		container_of(link, struct bpf_raw_tp_link, link);
   3138	char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
   3139	const char *tp_name = raw_tp_link->btp->tp->name;
   3140	u32 ulen = info->raw_tracepoint.tp_name_len;
   3141	size_t tp_len = strlen(tp_name);
   3142
   3143	if (!ulen ^ !ubuf)
   3144		return -EINVAL;
   3145
   3146	info->raw_tracepoint.tp_name_len = tp_len + 1;
   3147
   3148	if (!ubuf)
   3149		return 0;
   3150
   3151	if (ulen >= tp_len + 1) {
   3152		if (copy_to_user(ubuf, tp_name, tp_len + 1))
   3153			return -EFAULT;
   3154	} else {
   3155		char zero = '\0';
   3156
   3157		if (copy_to_user(ubuf, tp_name, ulen - 1))
   3158			return -EFAULT;
   3159		if (put_user(zero, ubuf + ulen - 1))
   3160			return -EFAULT;
   3161		return -ENOSPC;
   3162	}
   3163
   3164	return 0;
   3165}
   3166
   3167static const struct bpf_link_ops bpf_raw_tp_link_lops = {
   3168	.release = bpf_raw_tp_link_release,
   3169	.dealloc = bpf_raw_tp_link_dealloc,
   3170	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
   3171	.fill_link_info = bpf_raw_tp_link_fill_link_info,
   3172};
   3173
   3174#ifdef CONFIG_PERF_EVENTS
   3175struct bpf_perf_link {
   3176	struct bpf_link link;
   3177	struct file *perf_file;
   3178};
   3179
   3180static void bpf_perf_link_release(struct bpf_link *link)
   3181{
   3182	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
   3183	struct perf_event *event = perf_link->perf_file->private_data;
   3184
   3185	perf_event_free_bpf_prog(event);
   3186	fput(perf_link->perf_file);
   3187}
   3188
   3189static void bpf_perf_link_dealloc(struct bpf_link *link)
   3190{
   3191	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
   3192
   3193	kfree(perf_link);
   3194}
   3195
   3196static const struct bpf_link_ops bpf_perf_link_lops = {
   3197	.release = bpf_perf_link_release,
   3198	.dealloc = bpf_perf_link_dealloc,
   3199};
   3200
   3201static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
   3202{
   3203	struct bpf_link_primer link_primer;
   3204	struct bpf_perf_link *link;
   3205	struct perf_event *event;
   3206	struct file *perf_file;
   3207	int err;
   3208
   3209	if (attr->link_create.flags)
   3210		return -EINVAL;
   3211
   3212	perf_file = perf_event_get(attr->link_create.target_fd);
   3213	if (IS_ERR(perf_file))
   3214		return PTR_ERR(perf_file);
   3215
   3216	link = kzalloc(sizeof(*link), GFP_USER);
   3217	if (!link) {
   3218		err = -ENOMEM;
   3219		goto out_put_file;
   3220	}
   3221	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
   3222	link->perf_file = perf_file;
   3223
   3224	err = bpf_link_prime(&link->link, &link_primer);
   3225	if (err) {
   3226		kfree(link);
   3227		goto out_put_file;
   3228	}
   3229
   3230	event = perf_file->private_data;
   3231	err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
   3232	if (err) {
   3233		bpf_link_cleanup(&link_primer);
   3234		goto out_put_file;
   3235	}
   3236	/* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
   3237	bpf_prog_inc(prog);
   3238
   3239	return bpf_link_settle(&link_primer);
   3240
   3241out_put_file:
   3242	fput(perf_file);
   3243	return err;
   3244}
   3245#else
   3246static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
   3247{
   3248	return -EOPNOTSUPP;
   3249}
   3250#endif /* CONFIG_PERF_EVENTS */
   3251
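/* Create a raw tracepoint link. TRACING/EXT/LSM programs take their attach
 * point from the BTF id recorded at load time (a tracepoint name is rejected);
 * RAW_TRACEPOINT programs look the tracepoint up by the user-supplied name.
 */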
   3252static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
   3253				  const char __user *user_tp_name)
   3254{
   3255	struct bpf_link_primer link_primer;
   3256	struct bpf_raw_tp_link *link;
   3257	struct bpf_raw_event_map *btp;
   3258	const char *tp_name;
   3259	char buf[128];
   3260	int err;
   3261
   3262	switch (prog->type) {
   3263	case BPF_PROG_TYPE_TRACING:
   3264	case BPF_PROG_TYPE_EXT:
   3265	case BPF_PROG_TYPE_LSM:
   3266		if (user_tp_name)
   3267			/* The attach point for this category of programs
   3268			 * should be specified via btf_id during program load.
   3269			 */
   3270			return -EINVAL;
   3271		if (prog->type == BPF_PROG_TYPE_TRACING &&
   3272		    prog->expected_attach_type == BPF_TRACE_RAW_TP) {
   3273			tp_name = prog->aux->attach_func_name;
   3274			break;
   3275		}
   3276		return bpf_tracing_prog_attach(prog, 0, 0, 0);
   3277	case BPF_PROG_TYPE_RAW_TRACEPOINT:
   3278	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
   3279		if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
   3280			return -EFAULT;
   3281		buf[sizeof(buf) - 1] = 0;
   3282		tp_name = buf;
   3283		break;
   3284	default:
   3285		return -EINVAL;
   3286	}
   3287
   3288	btp = bpf_get_raw_tracepoint(tp_name);
   3289	if (!btp)
   3290		return -ENOENT;
   3291
   3292	link = kzalloc(sizeof(*link), GFP_USER);
   3293	if (!link) {
   3294		err = -ENOMEM;
   3295		goto out_put_btp;
   3296	}
   3297	bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
   3298		      &bpf_raw_tp_link_lops, prog);
   3299	link->btp = btp;
   3300
   3301	err = bpf_link_prime(&link->link, &link_primer);
   3302	if (err) {
   3303		kfree(link);
   3304		goto out_put_btp;
   3305	}
   3306
   3307	err = bpf_probe_register(link->btp, prog);
   3308	if (err) {
   3309		bpf_link_cleanup(&link_primer);
   3310		goto out_put_btp;
   3311	}
   3312
   3313	return bpf_link_settle(&link_primer);
   3314
   3315out_put_btp:
   3316	bpf_put_raw_tracepoint(btp);
   3317	return err;
   3318}
   3319
   3320#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
   3321
   3322static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
   3323{
   3324	struct bpf_prog *prog;
   3325	int fd;
   3326
   3327	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
   3328		return -EINVAL;
   3329
   3330	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
   3331	if (IS_ERR(prog))
   3332		return PTR_ERR(prog);
   3333
   3334	fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
   3335	if (fd < 0)
   3336		bpf_prog_put(prog);
   3337	return fd;
   3338}
   3339
   3340static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
   3341					     enum bpf_attach_type attach_type)
   3342{
   3343	switch (prog->type) {
   3344	case BPF_PROG_TYPE_CGROUP_SOCK:
   3345	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   3346	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   3347	case BPF_PROG_TYPE_SK_LOOKUP:
   3348		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
   3349	case BPF_PROG_TYPE_CGROUP_SKB:
   3350		if (!capable(CAP_NET_ADMIN))
   3351			/* cg-skb progs can be loaded by unpriv user.
   3352			 * check permissions at attach time.
   3353			 */
   3354			return -EPERM;
   3355		return prog->enforce_expected_attach_type &&
   3356			prog->expected_attach_type != attach_type ?
   3357			-EINVAL : 0;
   3358	default:
   3359		return 0;
   3360	}
   3361}
   3362
   3363static enum bpf_prog_type
   3364attach_type_to_prog_type(enum bpf_attach_type attach_type)
   3365{
   3366	switch (attach_type) {
   3367	case BPF_CGROUP_INET_INGRESS:
   3368	case BPF_CGROUP_INET_EGRESS:
   3369		return BPF_PROG_TYPE_CGROUP_SKB;
   3370	case BPF_CGROUP_INET_SOCK_CREATE:
   3371	case BPF_CGROUP_INET_SOCK_RELEASE:
   3372	case BPF_CGROUP_INET4_POST_BIND:
   3373	case BPF_CGROUP_INET6_POST_BIND:
   3374		return BPF_PROG_TYPE_CGROUP_SOCK;
   3375	case BPF_CGROUP_INET4_BIND:
   3376	case BPF_CGROUP_INET6_BIND:
   3377	case BPF_CGROUP_INET4_CONNECT:
   3378	case BPF_CGROUP_INET6_CONNECT:
   3379	case BPF_CGROUP_INET4_GETPEERNAME:
   3380	case BPF_CGROUP_INET6_GETPEERNAME:
   3381	case BPF_CGROUP_INET4_GETSOCKNAME:
   3382	case BPF_CGROUP_INET6_GETSOCKNAME:
   3383	case BPF_CGROUP_UDP4_SENDMSG:
   3384	case BPF_CGROUP_UDP6_SENDMSG:
   3385	case BPF_CGROUP_UDP4_RECVMSG:
   3386	case BPF_CGROUP_UDP6_RECVMSG:
   3387		return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
   3388	case BPF_CGROUP_SOCK_OPS:
   3389		return BPF_PROG_TYPE_SOCK_OPS;
   3390	case BPF_CGROUP_DEVICE:
   3391		return BPF_PROG_TYPE_CGROUP_DEVICE;
   3392	case BPF_SK_MSG_VERDICT:
   3393		return BPF_PROG_TYPE_SK_MSG;
   3394	case BPF_SK_SKB_STREAM_PARSER:
   3395	case BPF_SK_SKB_STREAM_VERDICT:
   3396	case BPF_SK_SKB_VERDICT:
   3397		return BPF_PROG_TYPE_SK_SKB;
   3398	case BPF_LIRC_MODE2:
   3399		return BPF_PROG_TYPE_LIRC_MODE2;
   3400	case BPF_FLOW_DISSECTOR:
   3401		return BPF_PROG_TYPE_FLOW_DISSECTOR;
   3402	case BPF_CGROUP_SYSCTL:
   3403		return BPF_PROG_TYPE_CGROUP_SYSCTL;
   3404	case BPF_CGROUP_GETSOCKOPT:
   3405	case BPF_CGROUP_SETSOCKOPT:
   3406		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
   3407	case BPF_TRACE_ITER:
   3408	case BPF_TRACE_RAW_TP:
   3409	case BPF_TRACE_FENTRY:
   3410	case BPF_TRACE_FEXIT:
   3411	case BPF_MODIFY_RETURN:
   3412		return BPF_PROG_TYPE_TRACING;
   3413	case BPF_LSM_MAC:
   3414		return BPF_PROG_TYPE_LSM;
   3415	case BPF_SK_LOOKUP:
   3416		return BPF_PROG_TYPE_SK_LOOKUP;
   3417	case BPF_XDP:
   3418		return BPF_PROG_TYPE_XDP;
   3419	default:
   3420		return BPF_PROG_TYPE_UNSPEC;
   3421	}
   3422}
   3423
   3424#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
   3425
   3426#define BPF_F_ATTACH_MASK \
   3427	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
   3428
   3429static int bpf_prog_attach(const union bpf_attr *attr)
   3430{
   3431	enum bpf_prog_type ptype;
   3432	struct bpf_prog *prog;
   3433	int ret;
   3434
   3435	if (CHECK_ATTR(BPF_PROG_ATTACH))
   3436		return -EINVAL;
   3437
   3438	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
   3439		return -EINVAL;
   3440
   3441	ptype = attach_type_to_prog_type(attr->attach_type);
   3442	if (ptype == BPF_PROG_TYPE_UNSPEC)
   3443		return -EINVAL;
   3444
   3445	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
   3446	if (IS_ERR(prog))
   3447		return PTR_ERR(prog);
   3448
   3449	if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
   3450		bpf_prog_put(prog);
   3451		return -EINVAL;
   3452	}
   3453
   3454	switch (ptype) {
   3455	case BPF_PROG_TYPE_SK_SKB:
   3456	case BPF_PROG_TYPE_SK_MSG:
   3457		ret = sock_map_get_from_fd(attr, prog);
   3458		break;
   3459	case BPF_PROG_TYPE_LIRC_MODE2:
   3460		ret = lirc_prog_attach(attr, prog);
   3461		break;
   3462	case BPF_PROG_TYPE_FLOW_DISSECTOR:
   3463		ret = netns_bpf_prog_attach(attr, prog);
   3464		break;
   3465	case BPF_PROG_TYPE_CGROUP_DEVICE:
   3466	case BPF_PROG_TYPE_CGROUP_SKB:
   3467	case BPF_PROG_TYPE_CGROUP_SOCK:
   3468	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   3469	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   3470	case BPF_PROG_TYPE_CGROUP_SYSCTL:
   3471	case BPF_PROG_TYPE_SOCK_OPS:
   3472		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
   3473		break;
   3474	default:
   3475		ret = -EINVAL;
   3476	}
   3477
   3478	if (ret)
   3479		bpf_prog_put(prog);
   3480	return ret;
   3481}
   3482
   3483#define BPF_PROG_DETACH_LAST_FIELD attach_type
   3484
   3485static int bpf_prog_detach(const union bpf_attr *attr)
   3486{
   3487	enum bpf_prog_type ptype;
   3488
   3489	if (CHECK_ATTR(BPF_PROG_DETACH))
   3490		return -EINVAL;
   3491
   3492	ptype = attach_type_to_prog_type(attr->attach_type);
   3493
   3494	switch (ptype) {
   3495	case BPF_PROG_TYPE_SK_MSG:
   3496	case BPF_PROG_TYPE_SK_SKB:
   3497		return sock_map_prog_detach(attr, ptype);
   3498	case BPF_PROG_TYPE_LIRC_MODE2:
   3499		return lirc_prog_detach(attr);
   3500	case BPF_PROG_TYPE_FLOW_DISSECTOR:
   3501		return netns_bpf_prog_detach(attr, ptype);
   3502	case BPF_PROG_TYPE_CGROUP_DEVICE:
   3503	case BPF_PROG_TYPE_CGROUP_SKB:
   3504	case BPF_PROG_TYPE_CGROUP_SOCK:
   3505	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   3506	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   3507	case BPF_PROG_TYPE_CGROUP_SYSCTL:
   3508	case BPF_PROG_TYPE_SOCK_OPS:
   3509		return cgroup_bpf_prog_detach(attr, ptype);
   3510	default:
   3511		return -EINVAL;
   3512	}
   3513}
   3514
   3515#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
   3516
   3517static int bpf_prog_query(const union bpf_attr *attr,
   3518			  union bpf_attr __user *uattr)
   3519{
   3520	if (!capable(CAP_NET_ADMIN))
   3521		return -EPERM;
   3522	if (CHECK_ATTR(BPF_PROG_QUERY))
   3523		return -EINVAL;
   3524	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
   3525		return -EINVAL;
   3526
   3527	switch (attr->query.attach_type) {
   3528	case BPF_CGROUP_INET_INGRESS:
   3529	case BPF_CGROUP_INET_EGRESS:
   3530	case BPF_CGROUP_INET_SOCK_CREATE:
   3531	case BPF_CGROUP_INET_SOCK_RELEASE:
   3532	case BPF_CGROUP_INET4_BIND:
   3533	case BPF_CGROUP_INET6_BIND:
   3534	case BPF_CGROUP_INET4_POST_BIND:
   3535	case BPF_CGROUP_INET6_POST_BIND:
   3536	case BPF_CGROUP_INET4_CONNECT:
   3537	case BPF_CGROUP_INET6_CONNECT:
   3538	case BPF_CGROUP_INET4_GETPEERNAME:
   3539	case BPF_CGROUP_INET6_GETPEERNAME:
   3540	case BPF_CGROUP_INET4_GETSOCKNAME:
   3541	case BPF_CGROUP_INET6_GETSOCKNAME:
   3542	case BPF_CGROUP_UDP4_SENDMSG:
   3543	case BPF_CGROUP_UDP6_SENDMSG:
   3544	case BPF_CGROUP_UDP4_RECVMSG:
   3545	case BPF_CGROUP_UDP6_RECVMSG:
   3546	case BPF_CGROUP_SOCK_OPS:
   3547	case BPF_CGROUP_DEVICE:
   3548	case BPF_CGROUP_SYSCTL:
   3549	case BPF_CGROUP_GETSOCKOPT:
   3550	case BPF_CGROUP_SETSOCKOPT:
   3551		return cgroup_bpf_prog_query(attr, uattr);
   3552	case BPF_LIRC_MODE2:
   3553		return lirc_prog_query(attr, uattr);
   3554	case BPF_FLOW_DISSECTOR:
   3555	case BPF_SK_LOOKUP:
   3556		return netns_bpf_prog_query(attr, uattr);
   3557	case BPF_SK_SKB_STREAM_PARSER:
   3558	case BPF_SK_SKB_STREAM_VERDICT:
   3559	case BPF_SK_MSG_VERDICT:
   3560	case BPF_SK_SKB_VERDICT:
   3561		return sock_map_bpf_prog_query(attr, uattr);
   3562	default:
   3563		return -EINVAL;
   3564	}
   3565}
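
/*
 * A minimal user-space sketch of BPF_PROG_QUERY against a cgroup
 * (illustrative only; assumes <linux/bpf.h>, <sys/syscall.h> and an
 * open cgroup directory fd in cgroup_fd).  The kernel rewrites
 * attr.query.prog_cnt to the number of attached programs, so callers
 * usually probe once with prog_ids == 0 and then retry with a buffer:
 *
 *	union bpf_attr attr = {};
 *	__u32 ids[64];
 *
 *	attr.query.target_fd   = cgroup_fd;
 *	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
 *	attr.query.prog_ids    = (__u64)(unsigned long)ids;
 *	attr.query.prog_cnt    = 64;
 *	syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
 */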
   3566
   3567#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
   3568
   3569static int bpf_prog_test_run(const union bpf_attr *attr,
   3570			     union bpf_attr __user *uattr)
   3571{
   3572	struct bpf_prog *prog;
   3573	int ret = -ENOTSUPP;
   3574
   3575	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
   3576		return -EINVAL;
   3577
   3578	if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
   3579	    (!attr->test.ctx_size_in && attr->test.ctx_in))
   3580		return -EINVAL;
   3581
   3582	if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
   3583	    (!attr->test.ctx_size_out && attr->test.ctx_out))
   3584		return -EINVAL;
   3585
   3586	prog = bpf_prog_get(attr->test.prog_fd);
   3587	if (IS_ERR(prog))
   3588		return PTR_ERR(prog);
   3589
   3590	if (prog->aux->ops->test_run)
   3591		ret = prog->aux->ops->test_run(prog, attr, uattr);
   3592
   3593	bpf_prog_put(prog);
   3594	return ret;
   3595}
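
/*
 * A minimal user-space sketch of BPF_PROG_TEST_RUN (illustrative only;
 * assumes prog_fd refers to a loaded program whose type implements
 * ->test_run, e.g. a socket filter).  The ctx_in/ctx_size_in and
 * ctx_out/ctx_size_out pairs validated above must be supplied together
 * or not at all:
 *
 *	union bpf_attr attr = {};
 *	unsigned char pkt[64] = { };	// test packet contents
 *
 *	attr.test.prog_fd      = prog_fd;
 *	attr.test.data_in      = (__u64)(unsigned long)pkt;
 *	attr.test.data_size_in = sizeof(pkt);
 *	attr.test.repeat       = 1;
 *	syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
 *
 * On success attr.test.retval and attr.test.duration hold the program's
 * return value and the average per-run duration in nanoseconds.
 */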
   3596
   3597#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
   3598
   3599static int bpf_obj_get_next_id(const union bpf_attr *attr,
   3600			       union bpf_attr __user *uattr,
   3601			       struct idr *idr,
   3602			       spinlock_t *lock)
   3603{
   3604	u32 next_id = attr->start_id;
   3605	int err = 0;
   3606
   3607	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
   3608		return -EINVAL;
   3609
   3610	if (!capable(CAP_SYS_ADMIN))
   3611		return -EPERM;
   3612
   3613	next_id++;
   3614	spin_lock_bh(lock);
   3615	if (!idr_get_next(idr, &next_id))
   3616		err = -ENOENT;
   3617	spin_unlock_bh(lock);
   3618
   3619	if (!err)
   3620		err = put_user(next_id, &uattr->next_id);
   3621
   3622	return err;
   3623}
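
/*
 * A minimal user-space sketch of walking all program IDs with
 * BPF_PROG_GET_NEXT_ID (illustrative only; requires CAP_SYS_ADMIN).
 * The same loop works for maps, BTF objects and links via the
 * corresponding *_GET_NEXT_ID commands:
 *
 *	union bpf_attr attr = {};
 *	__u32 id = 0;
 *
 *	for (;;) {
 *		attr.start_id = id;
 *		if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
 *			break;		// ENOENT: no more IDs
 *		id = attr.next_id;
 *		// e.g. resolve the ID with BPF_PROG_GET_FD_BY_ID below
 *	}
 */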
   3624
   3625struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
   3626{
   3627	struct bpf_map *map;
   3628
   3629	spin_lock_bh(&map_idr_lock);
   3630again:
   3631	map = idr_get_next(&map_idr, id);
   3632	if (map) {
   3633		map = __bpf_map_inc_not_zero(map, false);
   3634		if (IS_ERR(map)) {
   3635			(*id)++;
   3636			goto again;
   3637		}
   3638	}
   3639	spin_unlock_bh(&map_idr_lock);
   3640
   3641	return map;
   3642}
   3643
   3644struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
   3645{
   3646	struct bpf_prog *prog;
   3647
   3648	spin_lock_bh(&prog_idr_lock);
   3649again:
   3650	prog = idr_get_next(&prog_idr, id);
   3651	if (prog) {
   3652		prog = bpf_prog_inc_not_zero(prog);
   3653		if (IS_ERR(prog)) {
   3654			(*id)++;
   3655			goto again;
   3656		}
   3657	}
   3658	spin_unlock_bh(&prog_idr_lock);
   3659
   3660	return prog;
   3661}
   3662
   3663#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
   3664
   3665struct bpf_prog *bpf_prog_by_id(u32 id)
   3666{
   3667	struct bpf_prog *prog;
   3668
   3669	if (!id)
   3670		return ERR_PTR(-ENOENT);
   3671
   3672	spin_lock_bh(&prog_idr_lock);
   3673	prog = idr_find(&prog_idr, id);
   3674	if (prog)
   3675		prog = bpf_prog_inc_not_zero(prog);
   3676	else
   3677		prog = ERR_PTR(-ENOENT);
   3678	spin_unlock_bh(&prog_idr_lock);
   3679	return prog;
   3680}
   3681
   3682static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
   3683{
   3684	struct bpf_prog *prog;
   3685	u32 id = attr->prog_id;
   3686	int fd;
   3687
   3688	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
   3689		return -EINVAL;
   3690
   3691	if (!capable(CAP_SYS_ADMIN))
   3692		return -EPERM;
   3693
   3694	prog = bpf_prog_by_id(id);
   3695	if (IS_ERR(prog))
   3696		return PTR_ERR(prog);
   3697
   3698	fd = bpf_prog_new_fd(prog);
   3699	if (fd < 0)
   3700		bpf_prog_put(prog);
   3701
   3702	return fd;
   3703}
   3704
   3705#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
   3706
   3707static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
   3708{
   3709	struct bpf_map *map;
   3710	u32 id = attr->map_id;
   3711	int f_flags;
   3712	int fd;
   3713
   3714	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
   3715	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
   3716		return -EINVAL;
   3717
   3718	if (!capable(CAP_SYS_ADMIN))
   3719		return -EPERM;
   3720
   3721	f_flags = bpf_get_file_flag(attr->open_flags);
   3722	if (f_flags < 0)
   3723		return f_flags;
   3724
   3725	spin_lock_bh(&map_idr_lock);
   3726	map = idr_find(&map_idr, id);
   3727	if (map)
   3728		map = __bpf_map_inc_not_zero(map, true);
   3729	else
   3730		map = ERR_PTR(-ENOENT);
   3731	spin_unlock_bh(&map_idr_lock);
   3732
   3733	if (IS_ERR(map))
   3734		return PTR_ERR(map);
   3735
   3736	fd = bpf_map_new_fd(map, f_flags);
   3737	if (fd < 0)
   3738		bpf_map_put_with_uref(map);
   3739
   3740	return fd;
   3741}
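
/*
 * A minimal user-space sketch of BPF_MAP_GET_FD_BY_ID (illustrative
 * only; requires CAP_SYS_ADMIN and a valid id in map_id).  open_flags
 * may be 0, BPF_F_RDONLY or BPF_F_WRONLY, i.e. a subset of
 * BPF_OBJ_FLAG_MASK:
 *
 *	union bpf_attr attr = {};
 *	int map_fd;
 *
 *	attr.map_id     = map_id;
 *	attr.open_flags = BPF_F_RDONLY;
 *	map_fd = syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
 */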
   3742
   3743static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
   3744					      unsigned long addr, u32 *off,
   3745					      u32 *type)
   3746{
   3747	const struct bpf_map *map;
   3748	int i;
   3749
   3750	mutex_lock(&prog->aux->used_maps_mutex);
   3751	for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
   3752		map = prog->aux->used_maps[i];
   3753		if (map == (void *)addr) {
   3754			*type = BPF_PSEUDO_MAP_FD;
   3755			goto out;
   3756		}
   3757		if (!map->ops->map_direct_value_meta)
   3758			continue;
   3759		if (!map->ops->map_direct_value_meta(map, addr, off)) {
   3760			*type = BPF_PSEUDO_MAP_VALUE;
   3761			goto out;
   3762		}
   3763	}
   3764	map = NULL;
   3765
   3766out:
   3767	mutex_unlock(&prog->aux->used_maps_mutex);
   3768	return map;
   3769}
   3770
   3771static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
   3772					      const struct cred *f_cred)
   3773{
   3774	const struct bpf_map *map;
   3775	struct bpf_insn *insns;
   3776	u32 off, type;
   3777	u64 imm;
   3778	u8 code;
   3779	int i;
   3780
   3781	insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
   3782			GFP_USER);
   3783	if (!insns)
   3784		return insns;
   3785
   3786	for (i = 0; i < prog->len; i++) {
   3787		code = insns[i].code;
   3788
   3789		if (code == (BPF_JMP | BPF_TAIL_CALL)) {
   3790			insns[i].code = BPF_JMP | BPF_CALL;
   3791			insns[i].imm = BPF_FUNC_tail_call;
   3792			/* fall-through */
   3793		}
   3794		if (code == (BPF_JMP | BPF_CALL) ||
   3795		    code == (BPF_JMP | BPF_CALL_ARGS)) {
   3796			if (code == (BPF_JMP | BPF_CALL_ARGS))
   3797				insns[i].code = BPF_JMP | BPF_CALL;
   3798			if (!bpf_dump_raw_ok(f_cred))
   3799				insns[i].imm = 0;
   3800			continue;
   3801		}
   3802		if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
   3803			insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
   3804			continue;
   3805		}
   3806
   3807		if (code != (BPF_LD | BPF_IMM | BPF_DW))
   3808			continue;
   3809
   3810		imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
   3811		map = bpf_map_from_imm(prog, imm, &off, &type);
   3812		if (map) {
   3813			insns[i].src_reg = type;
   3814			insns[i].imm = map->id;
   3815			insns[i + 1].imm = off;
   3816			continue;
   3817		}
   3818	}
   3819
   3820	return insns;
   3821}
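
/*
 * Note on the dump format produced above: a BPF_LD | BPF_IMM | BPF_DW
 * pair that referenced a map has its kernel pointer replaced by the map
 * ID, with src_reg set to BPF_PSEUDO_MAP_FD or BPF_PSEUDO_MAP_VALUE and
 * the second half-instruction carrying the value offset.  A dumper can
 * recover the map roughly like this (sketch, assuming insn[] was read
 * back via xlated_prog_insns):
 *
 *	if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
 *	    (insn[i].src_reg == BPF_PSEUDO_MAP_FD ||
 *	     insn[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
 *		__u32 map_id = insn[i].imm;
 *		__u32 off    = insn[i + 1].imm;	// non-zero only for MAP_VALUE
 *		// resolve via BPF_MAP_GET_FD_BY_ID + BPF_OBJ_GET_INFO_BY_FD
 *	}
 */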
   3822
   3823static int set_info_rec_size(struct bpf_prog_info *info)
   3824{
   3825	/*
   3826	 * Either info.*_rec_size matches the record size the kernel expects,
   3827	 *
   3828	 * or
   3829	 *
   3830	 * both *_rec_size and the corresponding *_cnt are zero.  In that
   3831	 * case the kernel writes the expected *_rec_size back into the
   3832	 * info for user space to pick up.
   3833	 */
   3834
   3835	if ((info->nr_func_info || info->func_info_rec_size) &&
   3836	    info->func_info_rec_size != sizeof(struct bpf_func_info))
   3837		return -EINVAL;
   3838
   3839	if ((info->nr_line_info || info->line_info_rec_size) &&
   3840	    info->line_info_rec_size != sizeof(struct bpf_line_info))
   3841		return -EINVAL;
   3842
   3843	if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
   3844	    info->jited_line_info_rec_size != sizeof(__u64))
   3845		return -EINVAL;
   3846
   3847	info->func_info_rec_size = sizeof(struct bpf_func_info);
   3848	info->line_info_rec_size = sizeof(struct bpf_line_info);
   3849	info->jited_line_info_rec_size = sizeof(__u64);
   3850
   3851	return 0;
   3852}
   3853
   3854static int bpf_prog_get_info_by_fd(struct file *file,
   3855				   struct bpf_prog *prog,
   3856				   const union bpf_attr *attr,
   3857				   union bpf_attr __user *uattr)
   3858{
   3859	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
   3860	struct bpf_prog_info info;
   3861	u32 info_len = attr->info.info_len;
   3862	struct bpf_prog_kstats stats;
   3863	char __user *uinsns;
   3864	u32 ulen;
   3865	int err;
   3866
   3867	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
   3868	if (err)
   3869		return err;
   3870	info_len = min_t(u32, sizeof(info), info_len);
   3871
   3872	memset(&info, 0, sizeof(info));
   3873	if (copy_from_user(&info, uinfo, info_len))
   3874		return -EFAULT;
   3875
   3876	info.type = prog->type;
   3877	info.id = prog->aux->id;
   3878	info.load_time = prog->aux->load_time;
   3879	info.created_by_uid = from_kuid_munged(current_user_ns(),
   3880					       prog->aux->user->uid);
   3881	info.gpl_compatible = prog->gpl_compatible;
   3882
   3883	memcpy(info.tag, prog->tag, sizeof(prog->tag));
   3884	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
   3885
   3886	mutex_lock(&prog->aux->used_maps_mutex);
   3887	ulen = info.nr_map_ids;
   3888	info.nr_map_ids = prog->aux->used_map_cnt;
   3889	ulen = min_t(u32, info.nr_map_ids, ulen);
   3890	if (ulen) {
   3891		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
   3892		u32 i;
   3893
   3894		for (i = 0; i < ulen; i++)
   3895			if (put_user(prog->aux->used_maps[i]->id,
   3896				     &user_map_ids[i])) {
   3897				mutex_unlock(&prog->aux->used_maps_mutex);
   3898				return -EFAULT;
   3899			}
   3900	}
   3901	mutex_unlock(&prog->aux->used_maps_mutex);
   3902
   3903	err = set_info_rec_size(&info);
   3904	if (err)
   3905		return err;
   3906
   3907	bpf_prog_get_stats(prog, &stats);
   3908	info.run_time_ns = stats.nsecs;
   3909	info.run_cnt = stats.cnt;
   3910	info.recursion_misses = stats.misses;
   3911
   3912	info.verified_insns = prog->aux->verified_insns;
   3913
   3914	if (!bpf_capable()) {
   3915		info.jited_prog_len = 0;
   3916		info.xlated_prog_len = 0;
   3917		info.nr_jited_ksyms = 0;
   3918		info.nr_jited_func_lens = 0;
   3919		info.nr_func_info = 0;
   3920		info.nr_line_info = 0;
   3921		info.nr_jited_line_info = 0;
   3922		goto done;
   3923	}
   3924
   3925	ulen = info.xlated_prog_len;
   3926	info.xlated_prog_len = bpf_prog_insn_size(prog);
   3927	if (info.xlated_prog_len && ulen) {
   3928		struct bpf_insn *insns_sanitized;
   3929		bool fault;
   3930
   3931		if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
   3932			info.xlated_prog_insns = 0;
   3933			goto done;
   3934		}
   3935		insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
   3936		if (!insns_sanitized)
   3937			return -ENOMEM;
   3938		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
   3939		ulen = min_t(u32, info.xlated_prog_len, ulen);
   3940		fault = copy_to_user(uinsns, insns_sanitized, ulen);
   3941		kfree(insns_sanitized);
   3942		if (fault)
   3943			return -EFAULT;
   3944	}
   3945
   3946	if (bpf_prog_is_dev_bound(prog->aux)) {
   3947		err = bpf_prog_offload_info_fill(&info, prog);
   3948		if (err)
   3949			return err;
   3950		goto done;
   3951	}
   3952
   3953	/* NOTE: the following code is supposed to be skipped for offload.
   3954	 * bpf_prog_offload_info_fill() is the place to fill similar fields
   3955	 * for offload.
   3956	 */
   3957	ulen = info.jited_prog_len;
   3958	if (prog->aux->func_cnt) {
   3959		u32 i;
   3960
   3961		info.jited_prog_len = 0;
   3962		for (i = 0; i < prog->aux->func_cnt; i++)
   3963			info.jited_prog_len += prog->aux->func[i]->jited_len;
   3964	} else {
   3965		info.jited_prog_len = prog->jited_len;
   3966	}
   3967
   3968	if (info.jited_prog_len && ulen) {
   3969		if (bpf_dump_raw_ok(file->f_cred)) {
   3970			uinsns = u64_to_user_ptr(info.jited_prog_insns);
   3971			ulen = min_t(u32, info.jited_prog_len, ulen);
   3972
   3973			/* for multi-function programs, copy the JITed
   3974			 * instructions for all the functions
   3975			 */
   3976			if (prog->aux->func_cnt) {
   3977				u32 len, free, i;
   3978				u8 *img;
   3979
   3980				free = ulen;
   3981				for (i = 0; i < prog->aux->func_cnt; i++) {
   3982					len = prog->aux->func[i]->jited_len;
   3983					len = min_t(u32, len, free);
   3984					img = (u8 *) prog->aux->func[i]->bpf_func;
   3985					if (copy_to_user(uinsns, img, len))
   3986						return -EFAULT;
   3987					uinsns += len;
   3988					free -= len;
   3989					if (!free)
   3990						break;
   3991				}
   3992			} else {
   3993				if (copy_to_user(uinsns, prog->bpf_func, ulen))
   3994					return -EFAULT;
   3995			}
   3996		} else {
   3997			info.jited_prog_insns = 0;
   3998		}
   3999	}
   4000
   4001	ulen = info.nr_jited_ksyms;
   4002	info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
   4003	if (ulen) {
   4004		if (bpf_dump_raw_ok(file->f_cred)) {
   4005			unsigned long ksym_addr;
   4006			u64 __user *user_ksyms;
   4007			u32 i;
   4008
   4009			/* copy the address of the kernel symbol
   4010			 * corresponding to each function
   4011			 */
   4012			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
   4013			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
   4014			if (prog->aux->func_cnt) {
   4015				for (i = 0; i < ulen; i++) {
   4016					ksym_addr = (unsigned long)
   4017						prog->aux->func[i]->bpf_func;
   4018					if (put_user((u64) ksym_addr,
   4019						     &user_ksyms[i]))
   4020						return -EFAULT;
   4021				}
   4022			} else {
   4023				ksym_addr = (unsigned long) prog->bpf_func;
   4024				if (put_user((u64) ksym_addr, &user_ksyms[0]))
   4025					return -EFAULT;
   4026			}
   4027		} else {
   4028			info.jited_ksyms = 0;
   4029		}
   4030	}
   4031
   4032	ulen = info.nr_jited_func_lens;
   4033	info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
   4034	if (ulen) {
   4035		if (bpf_dump_raw_ok(file->f_cred)) {
   4036			u32 __user *user_lens;
   4037			u32 func_len, i;
   4038
   4039			/* copy the JITed image lengths for each function */
   4040			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
   4041			user_lens = u64_to_user_ptr(info.jited_func_lens);
   4042			if (prog->aux->func_cnt) {
   4043				for (i = 0; i < ulen; i++) {
   4044					func_len =
   4045						prog->aux->func[i]->jited_len;
   4046					if (put_user(func_len, &user_lens[i]))
   4047						return -EFAULT;
   4048				}
   4049			} else {
   4050				func_len = prog->jited_len;
   4051				if (put_user(func_len, &user_lens[0]))
   4052					return -EFAULT;
   4053			}
   4054		} else {
   4055			info.jited_func_lens = 0;
   4056		}
   4057	}
   4058
   4059	if (prog->aux->btf)
   4060		info.btf_id = btf_obj_id(prog->aux->btf);
   4061
   4062	ulen = info.nr_func_info;
   4063	info.nr_func_info = prog->aux->func_info_cnt;
   4064	if (info.nr_func_info && ulen) {
   4065		char __user *user_finfo;
   4066
   4067		user_finfo = u64_to_user_ptr(info.func_info);
   4068		ulen = min_t(u32, info.nr_func_info, ulen);
   4069		if (copy_to_user(user_finfo, prog->aux->func_info,
   4070				 info.func_info_rec_size * ulen))
   4071			return -EFAULT;
   4072	}
   4073
   4074	ulen = info.nr_line_info;
   4075	info.nr_line_info = prog->aux->nr_linfo;
   4076	if (info.nr_line_info && ulen) {
   4077		__u8 __user *user_linfo;
   4078
   4079		user_linfo = u64_to_user_ptr(info.line_info);
   4080		ulen = min_t(u32, info.nr_line_info, ulen);
   4081		if (copy_to_user(user_linfo, prog->aux->linfo,
   4082				 info.line_info_rec_size * ulen))
   4083			return -EFAULT;
   4084	}
   4085
   4086	ulen = info.nr_jited_line_info;
   4087	if (prog->aux->jited_linfo)
   4088		info.nr_jited_line_info = prog->aux->nr_linfo;
   4089	else
   4090		info.nr_jited_line_info = 0;
   4091	if (info.nr_jited_line_info && ulen) {
   4092		if (bpf_dump_raw_ok(file->f_cred)) {
   4093			__u64 __user *user_linfo;
   4094			u32 i;
   4095
   4096			user_linfo = u64_to_user_ptr(info.jited_line_info);
   4097			ulen = min_t(u32, info.nr_jited_line_info, ulen);
   4098			for (i = 0; i < ulen; i++) {
   4099				if (put_user((__u64)(long)prog->aux->jited_linfo[i],
   4100					     &user_linfo[i]))
   4101					return -EFAULT;
   4102			}
   4103		} else {
   4104			info.jited_line_info = 0;
   4105		}
   4106	}
   4107
   4108	ulen = info.nr_prog_tags;
   4109	info.nr_prog_tags = prog->aux->func_cnt ? : 1;
   4110	if (ulen) {
   4111		__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
   4112		u32 i;
   4113
   4114		user_prog_tags = u64_to_user_ptr(info.prog_tags);
   4115		ulen = min_t(u32, info.nr_prog_tags, ulen);
   4116		if (prog->aux->func_cnt) {
   4117			for (i = 0; i < ulen; i++) {
   4118				if (copy_to_user(user_prog_tags[i],
   4119						 prog->aux->func[i]->tag,
   4120						 BPF_TAG_SIZE))
   4121					return -EFAULT;
   4122			}
   4123		} else {
   4124			if (copy_to_user(user_prog_tags[0],
   4125					 prog->tag, BPF_TAG_SIZE))
   4126				return -EFAULT;
   4127		}
   4128	}
   4129
   4130done:
   4131	if (copy_to_user(uinfo, &info, info_len) ||
   4132	    put_user(info_len, &uattr->info.info_len))
   4133		return -EFAULT;
   4134
   4135	return 0;
   4136}
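
/*
 * A minimal user-space sketch of the usual two-pass pattern for
 * BPF_OBJ_GET_INFO_BY_FD on a program fd (illustrative only).  Any
 * *_len or nr_* field left non-zero in the second call is treated as a
 * request, so the struct is cleared before asking for specific arrays:
 *
 *	struct bpf_prog_info info = {};
 *	union bpf_attr attr = {};
 *	__u32 ids[16], nr;
 *
 *	attr.info.bpf_fd   = prog_fd;
 *	attr.info.info_len = sizeof(info);
 *	attr.info.info     = (__u64)(unsigned long)&info;
 *	syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
 *	nr = info.nr_map_ids;			// number of maps used by prog
 *
 *	memset(&info, 0, sizeof(info));
 *	info.nr_map_ids = nr < 16 ? nr : 16;	// capacity of ids[]
 *	info.map_ids    = (__u64)(unsigned long)ids;
 *	syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
 */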
   4137
   4138static int bpf_map_get_info_by_fd(struct file *file,
   4139				  struct bpf_map *map,
   4140				  const union bpf_attr *attr,
   4141				  union bpf_attr __user *uattr)
   4142{
   4143	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
   4144	struct bpf_map_info info;
   4145	u32 info_len = attr->info.info_len;
   4146	int err;
   4147
   4148	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
   4149	if (err)
   4150		return err;
   4151	info_len = min_t(u32, sizeof(info), info_len);
   4152
   4153	memset(&info, 0, sizeof(info));
   4154	info.type = map->map_type;
   4155	info.id = map->id;
   4156	info.key_size = map->key_size;
   4157	info.value_size = map->value_size;
   4158	info.max_entries = map->max_entries;
   4159	info.map_flags = map->map_flags;
   4160	info.map_extra = map->map_extra;
   4161	memcpy(info.name, map->name, sizeof(map->name));
   4162
   4163	if (map->btf) {
   4164		info.btf_id = btf_obj_id(map->btf);
   4165		info.btf_key_type_id = map->btf_key_type_id;
   4166		info.btf_value_type_id = map->btf_value_type_id;
   4167	}
   4168	info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
   4169
   4170	if (bpf_map_is_dev_bound(map)) {
   4171		err = bpf_map_offload_info_fill(&info, map);
   4172		if (err)
   4173			return err;
   4174	}
   4175
   4176	if (copy_to_user(uinfo, &info, info_len) ||
   4177	    put_user(info_len, &uattr->info.info_len))
   4178		return -EFAULT;
   4179
   4180	return 0;
   4181}
   4182
   4183static int bpf_btf_get_info_by_fd(struct file *file,
   4184				  struct btf *btf,
   4185				  const union bpf_attr *attr,
   4186				  union bpf_attr __user *uattr)
   4187{
   4188	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
   4189	u32 info_len = attr->info.info_len;
   4190	int err;
   4191
   4192	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
   4193	if (err)
   4194		return err;
   4195
   4196	return btf_get_info_by_fd(btf, attr, uattr);
   4197}
   4198
   4199static int bpf_link_get_info_by_fd(struct file *file,
   4200				  struct bpf_link *link,
   4201				  const union bpf_attr *attr,
   4202				  union bpf_attr __user *uattr)
   4203{
   4204	struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
   4205	struct bpf_link_info info;
   4206	u32 info_len = attr->info.info_len;
   4207	int err;
   4208
   4209	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
   4210	if (err)
   4211		return err;
   4212	info_len = min_t(u32, sizeof(info), info_len);
   4213
   4214	memset(&info, 0, sizeof(info));
   4215	if (copy_from_user(&info, uinfo, info_len))
   4216		return -EFAULT;
   4217
   4218	info.type = link->type;
   4219	info.id = link->id;
   4220	info.prog_id = link->prog->aux->id;
   4221
   4222	if (link->ops->fill_link_info) {
   4223		err = link->ops->fill_link_info(link, &info);
   4224		if (err)
   4225			return err;
   4226	}
   4227
   4228	if (copy_to_user(uinfo, &info, info_len) ||
   4229	    put_user(info_len, &uattr->info.info_len))
   4230		return -EFAULT;
   4231
   4232	return 0;
   4233}
   4234
   4235
   4236#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
   4237
   4238static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
   4239				  union bpf_attr __user *uattr)
   4240{
   4241	int ufd = attr->info.bpf_fd;
   4242	struct fd f;
   4243	int err;
   4244
   4245	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
   4246		return -EINVAL;
   4247
   4248	f = fdget(ufd);
   4249	if (!f.file)
   4250		return -EBADFD;
   4251
   4252	if (f.file->f_op == &bpf_prog_fops)
   4253		err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
   4254					      uattr);
   4255	else if (f.file->f_op == &bpf_map_fops)
   4256		err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
   4257					     uattr);
   4258	else if (f.file->f_op == &btf_fops)
   4259		err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
   4260	else if (f.file->f_op == &bpf_link_fops)
   4261		err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
   4262					      attr, uattr);
   4263	else
   4264		err = -EINVAL;
   4265
   4266	fdput(f);
   4267	return err;
   4268}
   4269
   4270#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
   4271
   4272static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
   4273{
   4274	if (CHECK_ATTR(BPF_BTF_LOAD))
   4275		return -EINVAL;
   4276
   4277	if (!bpf_capable())
   4278		return -EPERM;
   4279
   4280	return btf_new_fd(attr, uattr);
   4281}
   4282
   4283#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
   4284
   4285static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
   4286{
   4287	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
   4288		return -EINVAL;
   4289
   4290	if (!capable(CAP_SYS_ADMIN))
   4291		return -EPERM;
   4292
   4293	return btf_get_fd_by_id(attr->btf_id);
   4294}
   4295
   4296static int bpf_task_fd_query_copy(const union bpf_attr *attr,
   4297				    union bpf_attr __user *uattr,
   4298				    u32 prog_id, u32 fd_type,
   4299				    const char *buf, u64 probe_offset,
   4300				    u64 probe_addr)
   4301{
   4302	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
   4303	u32 len = buf ? strlen(buf) : 0, input_len;
   4304	int err = 0;
   4305
   4306	if (put_user(len, &uattr->task_fd_query.buf_len))
   4307		return -EFAULT;
   4308	input_len = attr->task_fd_query.buf_len;
   4309	if (input_len && ubuf) {
   4310		if (!len) {
   4311			/* nothing to copy, just make ubuf NULL terminated */
   4312			char zero = '\0';
   4313
   4314			if (put_user(zero, ubuf))
   4315				return -EFAULT;
   4316		} else if (input_len >= len + 1) {
   4317			/* ubuf can hold the string with NULL terminator */
   4318			if (copy_to_user(ubuf, buf, len + 1))
   4319				return -EFAULT;
   4320		} else {
   4321			/* ubuf cannot hold the string with NULL terminator,
   4322			 * do a partial copy with NULL terminator.
   4323			 */
   4324			char zero = '\0';
   4325
   4326			err = -ENOSPC;
   4327			if (copy_to_user(ubuf, buf, input_len - 1))
   4328				return -EFAULT;
   4329			if (put_user(zero, ubuf + input_len - 1))
   4330				return -EFAULT;
   4331		}
   4332	}
   4333
   4334	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
   4335	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
   4336	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
   4337	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
   4338		return -EFAULT;
   4339
   4340	return err;
   4341}
   4342
   4343#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
   4344
   4345static int bpf_task_fd_query(const union bpf_attr *attr,
   4346			     union bpf_attr __user *uattr)
   4347{
   4348	pid_t pid = attr->task_fd_query.pid;
   4349	u32 fd = attr->task_fd_query.fd;
   4350	const struct perf_event *event;
   4351	struct task_struct *task;
   4352	struct file *file;
   4353	int err;
   4354
   4355	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
   4356		return -EINVAL;
   4357
   4358	if (!capable(CAP_SYS_ADMIN))
   4359		return -EPERM;
   4360
   4361	if (attr->task_fd_query.flags != 0)
   4362		return -EINVAL;
   4363
   4364	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
   4365	if (!task)
   4366		return -ENOENT;
   4367
   4368	err = 0;
   4369	file = fget_task(task, fd);
   4370	put_task_struct(task);
   4371	if (!file)
   4372		return -EBADF;
   4373
   4374	if (file->f_op == &bpf_link_fops) {
   4375		struct bpf_link *link = file->private_data;
   4376
   4377		if (link->ops == &bpf_raw_tp_link_lops) {
   4378			struct bpf_raw_tp_link *raw_tp =
   4379				container_of(link, struct bpf_raw_tp_link, link);
   4380			struct bpf_raw_event_map *btp = raw_tp->btp;
   4381
   4382			err = bpf_task_fd_query_copy(attr, uattr,
   4383						     raw_tp->link.prog->aux->id,
   4384						     BPF_FD_TYPE_RAW_TRACEPOINT,
   4385						     btp->tp->name, 0, 0);
   4386			goto put_file;
   4387		}
   4388		goto out_not_supp;
   4389	}
   4390
   4391	event = perf_get_event(file);
   4392	if (!IS_ERR(event)) {
   4393		u64 probe_offset, probe_addr;
   4394		u32 prog_id, fd_type;
   4395		const char *buf;
   4396
   4397		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
   4398					      &buf, &probe_offset,
   4399					      &probe_addr);
   4400		if (!err)
   4401			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
   4402						     fd_type, buf,
   4403						     probe_offset,
   4404						     probe_addr);
   4405		goto put_file;
   4406	}
   4407
   4408out_not_supp:
   4409	err = -ENOTSUPP;
   4410put_file:
   4411	fput(file);
   4412	return err;
   4413}
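
/*
 * A minimal user-space sketch of BPF_TASK_FD_QUERY (illustrative only;
 * requires CAP_SYS_ADMIN, and pid/fd must name a perf_event or raw
 * tracepoint fd in the target task):
 *
 *	union bpf_attr attr = {};
 *	char name[256];
 *
 *	attr.task_fd_query.pid     = pid;
 *	attr.task_fd_query.fd      = target_fd;
 *	attr.task_fd_query.buf     = (__u64)(unsigned long)name;
 *	attr.task_fd_query.buf_len = sizeof(name);
 *	syscall(__NR_bpf, BPF_TASK_FD_QUERY, &attr, sizeof(attr));
 *
 * On success attr.task_fd_query.prog_id, .fd_type, .probe_offset and
 * .probe_addr describe the attached program, and name[] holds the
 * tracepoint/kprobe/uprobe name (ENOSPC means the buffer was too small).
 */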
   4414
   4415#define BPF_MAP_BATCH_LAST_FIELD batch.flags
   4416
   4417#define BPF_DO_BATCH(fn)			\
   4418	do {					\
   4419		if (!fn) {			\
   4420			err = -ENOTSUPP;	\
   4421			goto err_put;		\
   4422		}				\
   4423		err = fn(map, attr, uattr);	\
   4424	} while (0)
   4425
   4426static int bpf_map_do_batch(const union bpf_attr *attr,
   4427			    union bpf_attr __user *uattr,
   4428			    int cmd)
   4429{
   4430	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
   4431			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
   4432	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
   4433	struct bpf_map *map;
   4434	int err, ufd;
   4435	struct fd f;
   4436
   4437	if (CHECK_ATTR(BPF_MAP_BATCH))
   4438		return -EINVAL;
   4439
   4440	ufd = attr->batch.map_fd;
   4441	f = fdget(ufd);
   4442	map = __bpf_map_get(f);
   4443	if (IS_ERR(map))
   4444		return PTR_ERR(map);
   4445	if (has_write)
   4446		bpf_map_write_active_inc(map);
   4447	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
   4448		err = -EPERM;
   4449		goto err_put;
   4450	}
   4451	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
   4452		err = -EPERM;
   4453		goto err_put;
   4454	}
   4455
   4456	if (cmd == BPF_MAP_LOOKUP_BATCH)
   4457		BPF_DO_BATCH(map->ops->map_lookup_batch);
   4458	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
   4459		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
   4460	else if (cmd == BPF_MAP_UPDATE_BATCH)
   4461		BPF_DO_BATCH(map->ops->map_update_batch);
   4462	else
   4463		BPF_DO_BATCH(map->ops->map_delete_batch);
   4464err_put:
   4465	if (has_write)
   4466		bpf_map_write_active_dec(map);
   4467	fdput(f);
   4468	return err;
   4469}
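
/*
 * A minimal user-space sketch of BPF_MAP_LOOKUP_BATCH on a hash map
 * with 4-byte keys and values (illustrative only).  The opaque cookie
 * written through out_batch is fed back via in_batch on the next call;
 * ENOENT signals that the whole map has been visited:
 *
 *	union bpf_attr attr = {};
 *	__u32 keys[128], vals[128];
 *	__u64 batch = 0;
 *
 *	attr.batch.map_fd    = map_fd;
 *	attr.batch.keys      = (__u64)(unsigned long)keys;
 *	attr.batch.values    = (__u64)(unsigned long)vals;
 *	attr.batch.out_batch = (__u64)(unsigned long)&batch;
 *	attr.batch.count     = 128;
 *	syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
 *	// attr.batch.count now holds the number of elements returned;
 *	// set attr.batch.in_batch = attr.batch.out_batch to continue.
 */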
   4470
   4471#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
   4472static int link_create(union bpf_attr *attr, bpfptr_t uattr)
   4473{
   4474	enum bpf_prog_type ptype;
   4475	struct bpf_prog *prog;
   4476	int ret;
   4477
   4478	if (CHECK_ATTR(BPF_LINK_CREATE))
   4479		return -EINVAL;
   4480
   4481	prog = bpf_prog_get(attr->link_create.prog_fd);
   4482	if (IS_ERR(prog))
   4483		return PTR_ERR(prog);
   4484
   4485	ret = bpf_prog_attach_check_attach_type(prog,
   4486						attr->link_create.attach_type);
   4487	if (ret)
   4488		goto out;
   4489
   4490	switch (prog->type) {
   4491	case BPF_PROG_TYPE_EXT:
   4492		break;
   4493	case BPF_PROG_TYPE_PERF_EVENT:
   4494	case BPF_PROG_TYPE_TRACEPOINT:
   4495		if (attr->link_create.attach_type != BPF_PERF_EVENT) {
   4496			ret = -EINVAL;
   4497			goto out;
   4498		}
   4499		break;
   4500	case BPF_PROG_TYPE_KPROBE:
   4501		if (attr->link_create.attach_type != BPF_PERF_EVENT &&
   4502		    attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
   4503			ret = -EINVAL;
   4504			goto out;
   4505		}
   4506		break;
   4507	default:
   4508		ptype = attach_type_to_prog_type(attr->link_create.attach_type);
   4509		if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
   4510			ret = -EINVAL;
   4511			goto out;
   4512		}
   4513		break;
   4514	}
   4515
   4516	switch (prog->type) {
   4517	case BPF_PROG_TYPE_CGROUP_SKB:
   4518	case BPF_PROG_TYPE_CGROUP_SOCK:
   4519	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
   4520	case BPF_PROG_TYPE_SOCK_OPS:
   4521	case BPF_PROG_TYPE_CGROUP_DEVICE:
   4522	case BPF_PROG_TYPE_CGROUP_SYSCTL:
   4523	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
   4524		ret = cgroup_bpf_link_attach(attr, prog);
   4525		break;
   4526	case BPF_PROG_TYPE_EXT:
   4527		ret = bpf_tracing_prog_attach(prog,
   4528					      attr->link_create.target_fd,
   4529					      attr->link_create.target_btf_id,
   4530					      attr->link_create.tracing.cookie);
   4531		break;
   4532	case BPF_PROG_TYPE_LSM:
   4533	case BPF_PROG_TYPE_TRACING:
   4534		if (attr->link_create.attach_type != prog->expected_attach_type) {
   4535			ret = -EINVAL;
   4536			goto out;
   4537		}
   4538		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
   4539			ret = bpf_raw_tp_link_attach(prog, NULL);
   4540		else if (prog->expected_attach_type == BPF_TRACE_ITER)
   4541			ret = bpf_iter_link_attach(attr, uattr, prog);
   4542		else
   4543			ret = bpf_tracing_prog_attach(prog,
   4544						      attr->link_create.target_fd,
   4545						      attr->link_create.target_btf_id,
   4546						      attr->link_create.tracing.cookie);
   4547		break;
   4548	case BPF_PROG_TYPE_FLOW_DISSECTOR:
   4549	case BPF_PROG_TYPE_SK_LOOKUP:
   4550		ret = netns_bpf_link_create(attr, prog);
   4551		break;
   4552#ifdef CONFIG_NET
   4553	case BPF_PROG_TYPE_XDP:
   4554		ret = bpf_xdp_link_attach(attr, prog);
   4555		break;
   4556#endif
   4557	case BPF_PROG_TYPE_PERF_EVENT:
   4558	case BPF_PROG_TYPE_TRACEPOINT:
   4559		ret = bpf_perf_link_attach(attr, prog);
   4560		break;
   4561	case BPF_PROG_TYPE_KPROBE:
   4562		if (attr->link_create.attach_type == BPF_PERF_EVENT)
   4563			ret = bpf_perf_link_attach(attr, prog);
   4564		else
   4565			ret = bpf_kprobe_multi_link_attach(attr, prog);
   4566		break;
   4567	default:
   4568		ret = -EINVAL;
   4569	}
   4570
   4571out:
   4572	if (ret < 0)
   4573		bpf_prog_put(prog);
   4574	return ret;
   4575}
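
/*
 * A minimal user-space sketch of BPF_LINK_CREATE attaching an already
 * loaded BPF_PROG_TYPE_CGROUP_SKB program to a cgroup (illustrative
 * only; assumes open prog_fd and cgroup_fd).  The attachment lives as
 * long as the returned link fd (or a bpffs pin of it) does:
 *
 *	union bpf_attr attr = {};
 *	int link_fd;
 *
 *	attr.link_create.prog_fd     = prog_fd;
 *	attr.link_create.target_fd   = cgroup_fd;
 *	attr.link_create.attach_type = BPF_CGROUP_INET_INGRESS;
 *	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
 */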
   4576
   4577#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
   4578
   4579static int link_update(union bpf_attr *attr)
   4580{
   4581	struct bpf_prog *old_prog = NULL, *new_prog;
   4582	struct bpf_link *link;
   4583	u32 flags;
   4584	int ret;
   4585
   4586	if (CHECK_ATTR(BPF_LINK_UPDATE))
   4587		return -EINVAL;
   4588
   4589	flags = attr->link_update.flags;
   4590	if (flags & ~BPF_F_REPLACE)
   4591		return -EINVAL;
   4592
   4593	link = bpf_link_get_from_fd(attr->link_update.link_fd);
   4594	if (IS_ERR(link))
   4595		return PTR_ERR(link);
   4596
   4597	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
   4598	if (IS_ERR(new_prog)) {
   4599		ret = PTR_ERR(new_prog);
   4600		goto out_put_link;
   4601	}
   4602
   4603	if (flags & BPF_F_REPLACE) {
   4604		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
   4605		if (IS_ERR(old_prog)) {
   4606			ret = PTR_ERR(old_prog);
   4607			old_prog = NULL;
   4608			goto out_put_progs;
   4609		}
   4610	} else if (attr->link_update.old_prog_fd) {
   4611		ret = -EINVAL;
   4612		goto out_put_progs;
   4613	}
   4614
   4615	if (link->ops->update_prog)
   4616		ret = link->ops->update_prog(link, new_prog, old_prog);
   4617	else
   4618		ret = -EINVAL;
   4619
   4620out_put_progs:
   4621	if (old_prog)
   4622		bpf_prog_put(old_prog);
   4623	if (ret)
   4624		bpf_prog_put(new_prog);
   4625out_put_link:
   4626	bpf_link_put(link);
   4627	return ret;
   4628}
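
/*
 * A minimal user-space sketch of BPF_LINK_UPDATE, atomically swapping
 * the program behind an existing link (illustrative only; assumes open
 * link_fd and new_prog_fd of a compatible program).  With BPF_F_REPLACE
 * the kernel additionally verifies that old_prog_fd is the program
 * currently attached:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.link_update.link_fd     = link_fd;
 *	attr.link_update.new_prog_fd = new_prog_fd;
 *	attr.link_update.flags       = BPF_F_REPLACE;
 *	attr.link_update.old_prog_fd = old_prog_fd;
 *	syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));
 */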
   4629
   4630#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
   4631
   4632static int link_detach(union bpf_attr *attr)
   4633{
   4634	struct bpf_link *link;
   4635	int ret;
   4636
   4637	if (CHECK_ATTR(BPF_LINK_DETACH))
   4638		return -EINVAL;
   4639
   4640	link = bpf_link_get_from_fd(attr->link_detach.link_fd);
   4641	if (IS_ERR(link))
   4642		return PTR_ERR(link);
   4643
   4644	if (link->ops->detach)
   4645		ret = link->ops->detach(link);
   4646	else
   4647		ret = -EOPNOTSUPP;
   4648
   4649	bpf_link_put(link);
   4650	return ret;
   4651}
   4652
   4653static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
   4654{
   4655	return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
   4656}
   4657
   4658struct bpf_link *bpf_link_by_id(u32 id)
   4659{
   4660	struct bpf_link *link;
   4661
   4662	if (!id)
   4663		return ERR_PTR(-ENOENT);
   4664
   4665	spin_lock_bh(&link_idr_lock);
   4666	/* before link is "settled", ID is 0, pretend it doesn't exist yet */
   4667	link = idr_find(&link_idr, id);
   4668	if (link) {
   4669		if (link->id)
   4670			link = bpf_link_inc_not_zero(link);
   4671		else
   4672			link = ERR_PTR(-EAGAIN);
   4673	} else {
   4674		link = ERR_PTR(-ENOENT);
   4675	}
   4676	spin_unlock_bh(&link_idr_lock);
   4677	return link;
   4678}
   4679
   4680struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
   4681{
   4682	struct bpf_link *link;
   4683
   4684	spin_lock_bh(&link_idr_lock);
   4685again:
   4686	link = idr_get_next(&link_idr, id);
   4687	if (link) {
   4688		link = bpf_link_inc_not_zero(link);
   4689		if (IS_ERR(link)) {
   4690			(*id)++;
   4691			goto again;
   4692		}
   4693	}
   4694	spin_unlock_bh(&link_idr_lock);
   4695
   4696	return link;
   4697}
   4698
   4699#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
   4700
   4701static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
   4702{
   4703	struct bpf_link *link;
   4704	u32 id = attr->link_id;
   4705	int fd;
   4706
   4707	if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
   4708		return -EINVAL;
   4709
   4710	if (!capable(CAP_SYS_ADMIN))
   4711		return -EPERM;
   4712
   4713	link = bpf_link_by_id(id);
   4714	if (IS_ERR(link))
   4715		return PTR_ERR(link);
   4716
   4717	fd = bpf_link_new_fd(link);
   4718	if (fd < 0)
   4719		bpf_link_put(link);
   4720
   4721	return fd;
   4722}
   4723
   4724DEFINE_MUTEX(bpf_stats_enabled_mutex);
   4725
   4726static int bpf_stats_release(struct inode *inode, struct file *file)
   4727{
   4728	mutex_lock(&bpf_stats_enabled_mutex);
   4729	static_key_slow_dec(&bpf_stats_enabled_key.key);
   4730	mutex_unlock(&bpf_stats_enabled_mutex);
   4731	return 0;
   4732}
   4733
   4734static const struct file_operations bpf_stats_fops = {
   4735	.release = bpf_stats_release,
   4736};
   4737
   4738static int bpf_enable_runtime_stats(void)
   4739{
   4740	int fd;
   4741
   4742	mutex_lock(&bpf_stats_enabled_mutex);
   4743
   4744	/* Set a very high limit to avoid overflow */
   4745	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
   4746		mutex_unlock(&bpf_stats_enabled_mutex);
   4747		return -EBUSY;
   4748	}
   4749
   4750	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
   4751	if (fd >= 0)
   4752		static_key_slow_inc(&bpf_stats_enabled_key.key);
   4753
   4754	mutex_unlock(&bpf_stats_enabled_mutex);
   4755	return fd;
   4756}
   4757
   4758#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
   4759
   4760static int bpf_enable_stats(union bpf_attr *attr)
   4761{
   4762
   4763	if (CHECK_ATTR(BPF_ENABLE_STATS))
   4764		return -EINVAL;
   4765
   4766	if (!capable(CAP_SYS_ADMIN))
   4767		return -EPERM;
   4768
   4769	switch (attr->enable_stats.type) {
   4770	case BPF_STATS_RUN_TIME:
   4771		return bpf_enable_runtime_stats();
   4772	default:
   4773		break;
   4774	}
   4775	return -EINVAL;
   4776}
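
/*
 * A minimal user-space sketch of BPF_ENABLE_STATS (illustrative only;
 * requires CAP_SYS_ADMIN).  Run-time statistics stay enabled while the
 * returned fd is open; closing it lets bpf_stats_release() above drop
 * the static key again:
 *
 *	union bpf_attr attr = {};
 *	int stats_fd;
 *
 *	attr.enable_stats.type = BPF_STATS_RUN_TIME;
 *	stats_fd = syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
 *	// run_time_ns/run_cnt in bpf_prog_info accumulate from here on
 *	close(stats_fd);
 */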
   4777
   4778#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
   4779
   4780static int bpf_iter_create(union bpf_attr *attr)
   4781{
   4782	struct bpf_link *link;
   4783	int err;
   4784
   4785	if (CHECK_ATTR(BPF_ITER_CREATE))
   4786		return -EINVAL;
   4787
   4788	if (attr->iter_create.flags)
   4789		return -EINVAL;
   4790
   4791	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
   4792	if (IS_ERR(link))
   4793		return PTR_ERR(link);
   4794
   4795	err = bpf_iter_new_fd(link);
   4796	bpf_link_put(link);
   4797
   4798	return err;
   4799}
   4800
   4801#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
   4802
   4803static int bpf_prog_bind_map(union bpf_attr *attr)
   4804{
   4805	struct bpf_prog *prog;
   4806	struct bpf_map *map;
   4807	struct bpf_map **used_maps_old, **used_maps_new;
   4808	int i, ret = 0;
   4809
   4810	if (CHECK_ATTR(BPF_PROG_BIND_MAP))
   4811		return -EINVAL;
   4812
   4813	if (attr->prog_bind_map.flags)
   4814		return -EINVAL;
   4815
   4816	prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
   4817	if (IS_ERR(prog))
   4818		return PTR_ERR(prog);
   4819
   4820	map = bpf_map_get(attr->prog_bind_map.map_fd);
   4821	if (IS_ERR(map)) {
   4822		ret = PTR_ERR(map);
   4823		goto out_prog_put;
   4824	}
   4825
   4826	mutex_lock(&prog->aux->used_maps_mutex);
   4827
   4828	used_maps_old = prog->aux->used_maps;
   4829
   4830	for (i = 0; i < prog->aux->used_map_cnt; i++)
   4831		if (used_maps_old[i] == map) {
   4832			bpf_map_put(map);
   4833			goto out_unlock;
   4834		}
   4835
   4836	used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
   4837				      sizeof(used_maps_new[0]),
   4838				      GFP_KERNEL);
   4839	if (!used_maps_new) {
   4840		ret = -ENOMEM;
   4841		goto out_unlock;
   4842	}
   4843
   4844	memcpy(used_maps_new, used_maps_old,
   4845	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
   4846	used_maps_new[prog->aux->used_map_cnt] = map;
   4847
   4848	prog->aux->used_map_cnt++;
   4849	prog->aux->used_maps = used_maps_new;
   4850
   4851	kfree(used_maps_old);
   4852
   4853out_unlock:
   4854	mutex_unlock(&prog->aux->used_maps_mutex);
   4855
   4856	if (ret)
   4857		bpf_map_put(map);
   4858out_prog_put:
   4859	bpf_prog_put(prog);
   4860	return ret;
   4861}
   4862
   4863static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
   4864{
   4865	union bpf_attr attr;
   4866	bool capable;
   4867	int err;
   4868
   4869	capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
   4870
   4871	/* Intent here is for unprivileged_bpf_disabled to block key object
   4872	 * creation commands for unprivileged users; other actions depend
   4873	 * on fd availability and access to bpffs, so are dependent on
   4874	 * object creation success.  Capabilities are later verified for
   4875	 * operations such as load and map create, so even with unprivileged
   4876	 * BPF disabled, capability checks are still carried out for these
   4877	 * and other operations.
   4878	 */
   4879	if (!capable &&
   4880	    (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
   4881		return -EPERM;
   4882
   4883	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
   4884	if (err)
   4885		return err;
   4886	size = min_t(u32, size, sizeof(attr));
   4887
   4888	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
   4889	memset(&attr, 0, sizeof(attr));
   4890	if (copy_from_bpfptr(&attr, uattr, size) != 0)
   4891		return -EFAULT;
   4892
   4893	err = security_bpf(cmd, &attr, size);
   4894	if (err < 0)
   4895		return err;
   4896
   4897	switch (cmd) {
   4898	case BPF_MAP_CREATE:
   4899		err = map_create(&attr);
   4900		break;
   4901	case BPF_MAP_LOOKUP_ELEM:
   4902		err = map_lookup_elem(&attr);
   4903		break;
   4904	case BPF_MAP_UPDATE_ELEM:
   4905		err = map_update_elem(&attr, uattr);
   4906		break;
   4907	case BPF_MAP_DELETE_ELEM:
   4908		err = map_delete_elem(&attr);
   4909		break;
   4910	case BPF_MAP_GET_NEXT_KEY:
   4911		err = map_get_next_key(&attr);
   4912		break;
   4913	case BPF_MAP_FREEZE:
   4914		err = map_freeze(&attr);
   4915		break;
   4916	case BPF_PROG_LOAD:
   4917		err = bpf_prog_load(&attr, uattr);
   4918		break;
   4919	case BPF_OBJ_PIN:
   4920		err = bpf_obj_pin(&attr);
   4921		break;
   4922	case BPF_OBJ_GET:
   4923		err = bpf_obj_get(&attr);
   4924		break;
   4925	case BPF_PROG_ATTACH:
   4926		err = bpf_prog_attach(&attr);
   4927		break;
   4928	case BPF_PROG_DETACH:
   4929		err = bpf_prog_detach(&attr);
   4930		break;
   4931	case BPF_PROG_QUERY:
   4932		err = bpf_prog_query(&attr, uattr.user);
   4933		break;
   4934	case BPF_PROG_TEST_RUN:
   4935		err = bpf_prog_test_run(&attr, uattr.user);
   4936		break;
   4937	case BPF_PROG_GET_NEXT_ID:
   4938		err = bpf_obj_get_next_id(&attr, uattr.user,
   4939					  &prog_idr, &prog_idr_lock);
   4940		break;
   4941	case BPF_MAP_GET_NEXT_ID:
   4942		err = bpf_obj_get_next_id(&attr, uattr.user,
   4943					  &map_idr, &map_idr_lock);
   4944		break;
   4945	case BPF_BTF_GET_NEXT_ID:
   4946		err = bpf_obj_get_next_id(&attr, uattr.user,
   4947					  &btf_idr, &btf_idr_lock);
   4948		break;
   4949	case BPF_PROG_GET_FD_BY_ID:
   4950		err = bpf_prog_get_fd_by_id(&attr);
   4951		break;
   4952	case BPF_MAP_GET_FD_BY_ID:
   4953		err = bpf_map_get_fd_by_id(&attr);
   4954		break;
   4955	case BPF_OBJ_GET_INFO_BY_FD:
   4956		err = bpf_obj_get_info_by_fd(&attr, uattr.user);
   4957		break;
   4958	case BPF_RAW_TRACEPOINT_OPEN:
   4959		err = bpf_raw_tracepoint_open(&attr);
   4960		break;
   4961	case BPF_BTF_LOAD:
   4962		err = bpf_btf_load(&attr, uattr);
   4963		break;
   4964	case BPF_BTF_GET_FD_BY_ID:
   4965		err = bpf_btf_get_fd_by_id(&attr);
   4966		break;
   4967	case BPF_TASK_FD_QUERY:
   4968		err = bpf_task_fd_query(&attr, uattr.user);
   4969		break;
   4970	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
   4971		err = map_lookup_and_delete_elem(&attr);
   4972		break;
   4973	case BPF_MAP_LOOKUP_BATCH:
   4974		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
   4975		break;
   4976	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
   4977		err = bpf_map_do_batch(&attr, uattr.user,
   4978				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
   4979		break;
   4980	case BPF_MAP_UPDATE_BATCH:
   4981		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
   4982		break;
   4983	case BPF_MAP_DELETE_BATCH:
   4984		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
   4985		break;
   4986	case BPF_LINK_CREATE:
   4987		err = link_create(&attr, uattr);
   4988		break;
   4989	case BPF_LINK_UPDATE:
   4990		err = link_update(&attr);
   4991		break;
   4992	case BPF_LINK_GET_FD_BY_ID:
   4993		err = bpf_link_get_fd_by_id(&attr);
   4994		break;
   4995	case BPF_LINK_GET_NEXT_ID:
   4996		err = bpf_obj_get_next_id(&attr, uattr.user,
   4997					  &link_idr, &link_idr_lock);
   4998		break;
   4999	case BPF_ENABLE_STATS:
   5000		err = bpf_enable_stats(&attr);
   5001		break;
   5002	case BPF_ITER_CREATE:
   5003		err = bpf_iter_create(&attr);
   5004		break;
   5005	case BPF_LINK_DETACH:
   5006		err = link_detach(&attr);
   5007		break;
   5008	case BPF_PROG_BIND_MAP:
   5009		err = bpf_prog_bind_map(&attr);
   5010		break;
   5011	default:
   5012		err = -EINVAL;
   5013		break;
   5014	}
   5015
   5016	return err;
   5017}
   5018
   5019SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
   5020{
   5021	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
   5022}
   5023
   5024static bool syscall_prog_is_valid_access(int off, int size,
   5025					 enum bpf_access_type type,
   5026					 const struct bpf_prog *prog,
   5027					 struct bpf_insn_access_aux *info)
   5028{
   5029	if (off < 0 || off >= U16_MAX)
   5030		return false;
   5031	if (off % size != 0)
   5032		return false;
   5033	return true;
   5034}
   5035
   5036BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
   5037{
   5038	struct bpf_prog * __maybe_unused prog;
   5039	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
   5040
   5041	switch (cmd) {
   5042	case BPF_MAP_CREATE:
   5043	case BPF_MAP_UPDATE_ELEM:
   5044	case BPF_MAP_FREEZE:
   5045	case BPF_PROG_LOAD:
   5046	case BPF_BTF_LOAD:
   5047	case BPF_LINK_CREATE:
   5048	case BPF_RAW_TRACEPOINT_OPEN:
   5049		break;
   5050#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
   5051	case BPF_PROG_TEST_RUN:
   5052		if (attr->test.data_in || attr->test.data_out ||
   5053		    attr->test.ctx_out || attr->test.duration ||
   5054		    attr->test.repeat || attr->test.flags)
   5055			return -EINVAL;
   5056
   5057		prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
   5058		if (IS_ERR(prog))
   5059			return PTR_ERR(prog);
   5060
   5061		if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
   5062		    attr->test.ctx_size_in > U16_MAX) {
   5063			bpf_prog_put(prog);
   5064			return -EINVAL;
   5065		}
   5066
   5067		run_ctx.bpf_cookie = 0;
   5068		run_ctx.saved_run_ctx = NULL;
   5069		if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
   5070			/* recursion detected */
   5071			bpf_prog_put(prog);
   5072			return -EBUSY;
   5073		}
   5074		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
   5075		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
   5076		bpf_prog_put(prog);
   5077		return 0;
   5078#endif
   5079	default:
   5080		return -EINVAL;
   5081	}
   5082	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
   5083}
   5084EXPORT_SYMBOL(bpf_sys_bpf);
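
/*
 * A minimal sketch of how the helper above is used from the BPF side of
 * a BPF_PROG_TYPE_SYSCALL program (illustrative only; built with
 * libbpf's bpf_helpers.h and executed via BPF_PROG_TEST_RUN, which is
 * why only the short command list above is reachable from this context):
 *
 *	SEC("syscall")
 *	int create_array(void *ctx)
 *	{
 *		union bpf_attr attr = {
 *			.map_type    = BPF_MAP_TYPE_ARRAY,
 *			.key_size    = 4,
 *			.value_size  = 8,
 *			.max_entries = 1,
 *		};
 *
 *		return bpf_sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 *	}
 */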
   5085
   5086static const struct bpf_func_proto bpf_sys_bpf_proto = {
   5087	.func		= bpf_sys_bpf,
   5088	.gpl_only	= false,
   5089	.ret_type	= RET_INTEGER,
   5090	.arg1_type	= ARG_ANYTHING,
   5091	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
   5092	.arg3_type	= ARG_CONST_SIZE,
   5093};
   5094
   5095const struct bpf_func_proto * __weak
   5096tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
   5097{
   5098	return bpf_base_func_proto(func_id);
   5099}
   5100
   5101BPF_CALL_1(bpf_sys_close, u32, fd)
   5102{
   5103	/* When a bpf program calls this helper there must not be
   5104	 * an outstanding fdget() without a matching, completed fdput().
   5105	 * This helper is allowed in the following call chain only:
   5106	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
   5107	 */
   5108	return close_fd(fd);
   5109}
   5110
   5111static const struct bpf_func_proto bpf_sys_close_proto = {
   5112	.func		= bpf_sys_close,
   5113	.gpl_only	= false,
   5114	.ret_type	= RET_INTEGER,
   5115	.arg1_type	= ARG_ANYTHING,
   5116};
   5117
   5118BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
   5119{
   5120	if (flags)
   5121		return -EINVAL;
   5122
   5123	if (name_sz <= 1 || name[name_sz - 1])
   5124		return -EINVAL;
   5125
   5126	if (!bpf_dump_raw_ok(current_cred()))
   5127		return -EPERM;
   5128
   5129	*res = kallsyms_lookup_name(name);
   5130	return *res ? 0 : -ENOENT;
   5131}
   5132
   5133const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
   5134	.func		= bpf_kallsyms_lookup_name,
   5135	.gpl_only	= false,
   5136	.ret_type	= RET_INTEGER,
   5137	.arg1_type	= ARG_PTR_TO_MEM,
   5138	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
   5139	.arg3_type	= ARG_ANYTHING,
   5140	.arg4_type	= ARG_PTR_TO_LONG,
   5141};
   5142
   5143static const struct bpf_func_proto *
   5144syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
   5145{
   5146	switch (func_id) {
   5147	case BPF_FUNC_sys_bpf:
   5148		return &bpf_sys_bpf_proto;
   5149	case BPF_FUNC_btf_find_by_name_kind:
   5150		return &bpf_btf_find_by_name_kind_proto;
   5151	case BPF_FUNC_sys_close:
   5152		return &bpf_sys_close_proto;
   5153	case BPF_FUNC_kallsyms_lookup_name:
   5154		return &bpf_kallsyms_lookup_name_proto;
   5155	default:
   5156		return tracing_prog_func_proto(func_id, prog);
   5157	}
   5158}
   5159
   5160const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
   5161	.get_func_proto  = syscall_prog_func_proto,
   5162	.is_valid_access = syscall_prog_is_valid_access,
   5163};
   5164
   5165const struct bpf_prog_ops bpf_syscall_prog_ops = {
   5166	.test_run = bpf_prog_test_run_syscall,
   5167};
   5168
   5169#ifdef CONFIG_SYSCTL
   5170static int bpf_stats_handler(struct ctl_table *table, int write,
   5171			     void *buffer, size_t *lenp, loff_t *ppos)
   5172{
   5173	struct static_key *key = (struct static_key *)table->data;
   5174	static int saved_val;
   5175	int val, ret;
   5176	struct ctl_table tmp = {
   5177		.data   = &val,
   5178		.maxlen = sizeof(val),
   5179		.mode   = table->mode,
   5180		.extra1 = SYSCTL_ZERO,
   5181		.extra2 = SYSCTL_ONE,
   5182	};
   5183
   5184	if (write && !capable(CAP_SYS_ADMIN))
   5185		return -EPERM;
   5186
   5187	mutex_lock(&bpf_stats_enabled_mutex);
   5188	val = saved_val;
   5189	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
   5190	if (write && !ret && val != saved_val) {
   5191		if (val)
   5192			static_key_slow_inc(key);
   5193		else
   5194			static_key_slow_dec(key);
   5195		saved_val = val;
   5196	}
   5197	mutex_unlock(&bpf_stats_enabled_mutex);
   5198	return ret;
   5199}
   5200
   5201void __weak unpriv_ebpf_notify(int new_state)
   5202{
   5203}
   5204
   5205static int bpf_unpriv_handler(struct ctl_table *table, int write,
   5206			      void *buffer, size_t *lenp, loff_t *ppos)
   5207{
   5208	int ret, unpriv_enable = *(int *)table->data;
   5209	bool locked_state = unpriv_enable == 1;
   5210	struct ctl_table tmp = *table;
   5211
   5212	if (write && !capable(CAP_SYS_ADMIN))
   5213		return -EPERM;
   5214
   5215	tmp.data = &unpriv_enable;
   5216	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
   5217	if (write && !ret) {
   5218		if (locked_state && unpriv_enable != 1)
   5219			return -EPERM;
   5220		*(int *)table->data = unpriv_enable;
   5221	}
   5222
   5223	unpriv_ebpf_notify(unpriv_enable);
   5224
   5225	return ret;
   5226}
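
/*
 * Value semantics enforced above for kernel.unprivileged_bpf_disabled
 * (bounded to 0..2 by the table below): 0 allows unprivileged bpf(),
 * 2 blocks it but may later be set back to 0, and 1 blocks it
 * irrevocably: once the value is 1, any attempt to change it is
 * rejected until reboot.
 */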
   5227
   5228static struct ctl_table bpf_syscall_table[] = {
   5229	{
   5230		.procname	= "unprivileged_bpf_disabled",
   5231		.data		= &sysctl_unprivileged_bpf_disabled,
   5232		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
   5233		.mode		= 0644,
   5234		.proc_handler	= bpf_unpriv_handler,
   5235		.extra1		= SYSCTL_ZERO,
   5236		.extra2		= SYSCTL_TWO,
   5237	},
   5238	{
   5239		.procname	= "bpf_stats_enabled",
   5240		.data		= &bpf_stats_enabled_key.key,
   5241		.maxlen		= sizeof(bpf_stats_enabled_key),
   5242		.mode		= 0644,
   5243		.proc_handler	= bpf_stats_handler,
   5244	},
   5245	{ }
   5246};
   5247
   5248static int __init bpf_syscall_sysctl_init(void)
   5249{
   5250	register_sysctl_init("kernel", bpf_syscall_table);
   5251	return 0;
   5252}
   5253late_initcall(bpf_syscall_sysctl_init);
   5254#endif /* CONFIG_SYSCTL */