cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

proc_sysctl.c (49088B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * /proc/sys support
      4 */
      5#include <linux/init.h>
      6#include <linux/sysctl.h>
      7#include <linux/poll.h>
      8#include <linux/proc_fs.h>
      9#include <linux/printk.h>
     10#include <linux/security.h>
     11#include <linux/sched.h>
     12#include <linux/cred.h>
     13#include <linux/namei.h>
     14#include <linux/mm.h>
     15#include <linux/uio.h>
     16#include <linux/module.h>
     17#include <linux/bpf-cgroup.h>
     18#include <linux/mount.h>
     19#include <linux/kmemleak.h>
     20#include "internal.h"
     21
/*
 * Iterate @entry over the elements of the ctl_table array @table.  The walk
 * stops at the first element whose ->procname is NULL, i.e. the terminating
 * empty entry of the array.
 */
#define list_for_each_table_entry(entry, table) \
	for ((entry) = (table); (entry)->procname; (entry)++)
     24
/*
 * Forward declarations of the operation tables; they are defined further
 * down in this file, after the functions they refer to.
 */
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
     30
/* shared constants to be used in various sysctls */
const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);

/* unsigned long counterpart of the shared constants above */
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
EXPORT_SYMBOL_GPL(sysctl_long_vals);
     37
/*
 * Support for permanently empty directories
 *
 * This table holds nothing but the terminating entry.  Its address serves
 * as a marker: is_empty_dir(), set_empty_dir() and insert_header() compare
 * pointers against it.
 */

struct ctl_table sysctl_mount_point[] = {
	{ }
};
     43
     44/**
     45 * register_sysctl_mount_point() - registers a sysctl mount point
     46 * @path: path for the mount point
     47 *
     48 * Used to create a permanently empty directory to serve as mount point.
     49 * There are some subtle but important permission checks this allows in the
     50 * case of unprivileged mounts.
     51 */
     52struct ctl_table_header *register_sysctl_mount_point(const char *path)
     53{
     54	return register_sysctl(path, sysctl_mount_point);
     55}
     56EXPORT_SYMBOL(register_sysctl_mount_point);
     57
     58static bool is_empty_dir(struct ctl_table_header *head)
     59{
     60	return head->ctl_table[0].child == sysctl_mount_point;
     61}
     62
     63static void set_empty_dir(struct ctl_dir *dir)
     64{
     65	dir->header.ctl_table[0].child = sysctl_mount_point;
     66}
     67
     68static void clear_empty_dir(struct ctl_dir *dir)
     69
     70{
     71	dir->header.ctl_table[0].child = NULL;
     72}
     73
     74void proc_sys_poll_notify(struct ctl_table_poll *poll)
     75{
     76	if (!poll)
     77		return;
     78
     79	atomic_inc(&poll->event);
     80	wake_up_interruptible(&poll->wait);
     81}
     82
/*
 * Table behind the root sysctl directory: one directory entry with an empty
 * name and r-xr-xr-x permissions, followed by the terminating entry.
 */
static struct ctl_table root_table[] = {
	{
		.procname = "",
		.mode = S_IFDIR|S_IRUGO|S_IXUGO,
	},
	{ }
};
/*
 * The default sysctl root.  The directory header of its default set is
 * initialised statically: it refers to root_table, starts with count and
 * nreg at 1, and its ->root and ->set pointers refer back to this object.
 */
static struct ctl_table_root sysctl_table_root = {
	.default_set.dir.header = {
		{{.count = 1,
		  .nreg = 1,
		  .ctl_table = root_table }},
		.ctl_table_arg = root_table,
		.root = &sysctl_table_root,
		.set = &sysctl_table_root.default_set,
	},
};
    100
/*
 * Spinlock taken by the code below around rbtree lookups/updates and around
 * changes to the header use and reference counters.
 */
static DEFINE_SPINLOCK(sysctl_lock);

/* Forward declarations of helpers that the code below calls. */
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
	struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
    108
    109static void sysctl_print_dir(struct ctl_dir *dir)
    110{
    111	if (dir->header.parent)
    112		sysctl_print_dir(dir->header.parent);
    113	pr_cont("%s/", dir->header.ctl_table[0].procname);
    114}
    115
    116static int namecmp(const char *name1, int len1, const char *name2, int len2)
    117{
    118	int cmp;
    119
    120	cmp = memcmp(name1, name2, min(len1, len2));
    121	if (cmp == 0)
    122		cmp = len1 - len2;
    123	return cmp;
    124}
    125
    126/* Called under sysctl_lock */
    127static struct ctl_table *find_entry(struct ctl_table_header **phead,
    128	struct ctl_dir *dir, const char *name, int namelen)
    129{
    130	struct ctl_table_header *head;
    131	struct ctl_table *entry;
    132	struct rb_node *node = dir->root.rb_node;
    133
    134	while (node)
    135	{
    136		struct ctl_node *ctl_node;
    137		const char *procname;
    138		int cmp;
    139
    140		ctl_node = rb_entry(node, struct ctl_node, node);
    141		head = ctl_node->header;
    142		entry = &head->ctl_table[ctl_node - head->node];
    143		procname = entry->procname;
    144
    145		cmp = namecmp(name, namelen, procname, strlen(procname));
    146		if (cmp < 0)
    147			node = node->rb_left;
    148		else if (cmp > 0)
    149			node = node->rb_right;
    150		else {
    151			*phead = head;
    152			return entry;
    153		}
    154	}
    155	return NULL;
    156}
    157
    158static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
    159{
    160	struct rb_node *node = &head->node[entry - head->ctl_table].node;
    161	struct rb_node **p = &head->parent->root.rb_node;
    162	struct rb_node *parent = NULL;
    163	const char *name = entry->procname;
    164	int namelen = strlen(name);
    165
    166	while (*p) {
    167		struct ctl_table_header *parent_head;
    168		struct ctl_table *parent_entry;
    169		struct ctl_node *parent_node;
    170		const char *parent_name;
    171		int cmp;
    172
    173		parent = *p;
    174		parent_node = rb_entry(parent, struct ctl_node, node);
    175		parent_head = parent_node->header;
    176		parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
    177		parent_name = parent_entry->procname;
    178
    179		cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
    180		if (cmp < 0)
    181			p = &(*p)->rb_left;
    182		else if (cmp > 0)
    183			p = &(*p)->rb_right;
    184		else {
    185			pr_err("sysctl duplicate entry: ");
    186			sysctl_print_dir(head->parent);
    187			pr_cont("%s\n", entry->procname);
    188			return -EEXIST;
    189		}
    190	}
    191
    192	rb_link_node(node, parent, p);
    193	rb_insert_color(node, &head->parent->root);
    194	return 0;
    195}
    196
    197static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
    198{
    199	struct rb_node *node = &head->node[entry - head->ctl_table].node;
    200
    201	rb_erase(node, &head->parent->root);
    202}
    203
    204static void init_header(struct ctl_table_header *head,
    205	struct ctl_table_root *root, struct ctl_table_set *set,
    206	struct ctl_node *node, struct ctl_table *table)
    207{
    208	head->ctl_table = table;
    209	head->ctl_table_arg = table;
    210	head->used = 0;
    211	head->count = 1;
    212	head->nreg = 1;
    213	head->unregistering = NULL;
    214	head->root = root;
    215	head->set = set;
    216	head->parent = NULL;
    217	head->node = node;
    218	INIT_HLIST_HEAD(&head->inodes);
    219	if (node) {
    220		struct ctl_table *entry;
    221
    222		list_for_each_table_entry(entry, table) {
    223			node->header = head;
    224			node++;
    225		}
    226	}
    227}
    228
    229static void erase_header(struct ctl_table_header *head)
    230{
    231	struct ctl_table *entry;
    232
    233	list_for_each_table_entry(entry, head->ctl_table)
    234		erase_entry(head, entry);
    235}
    236
    237static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
    238{
    239	struct ctl_table *entry;
    240	int err;
    241
    242	/* Is this a permanently empty directory? */
    243	if (is_empty_dir(&dir->header))
    244		return -EROFS;
    245
    246	/* Am I creating a permanently empty directory? */
    247	if (header->ctl_table == sysctl_mount_point) {
    248		if (!RB_EMPTY_ROOT(&dir->root))
    249			return -EINVAL;
    250		set_empty_dir(dir);
    251	}
    252
    253	dir->header.nreg++;
    254	header->parent = dir;
    255	err = insert_links(header);
    256	if (err)
    257		goto fail_links;
    258	list_for_each_table_entry(entry, header->ctl_table) {
    259		err = insert_entry(header, entry);
    260		if (err)
    261			goto fail;
    262	}
    263	return 0;
    264fail:
    265	erase_header(header);
    266	put_links(header);
    267fail_links:
    268	if (header->ctl_table == sysctl_mount_point)
    269		clear_empty_dir(dir);
    270	header->parent = NULL;
    271	drop_sysctl_table(&dir->header);
    272	return err;
    273}
    274
    275/* called under sysctl_lock */
    276static int use_table(struct ctl_table_header *p)
    277{
    278	if (unlikely(p->unregistering))
    279		return 0;
    280	p->used++;
    281	return 1;
    282}
    283
    284/* called under sysctl_lock */
    285static void unuse_table(struct ctl_table_header *p)
    286{
    287	if (!--p->used)
    288		if (unlikely(p->unregistering))
    289			complete(p->unregistering);
    290}
    291
    292static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
    293{
    294	proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
    295}
    296
/*
 * Begin unregistering @p: mark it as unregistering, wait until nobody uses
 * it any more, invalidate the dentries that refer to it and unlink all of
 * its entries from the parent directory's rbtree.
 *
 * called under sysctl_lock, will reacquire if has to wait
 *
 * Note that the lock is dropped on both paths below and taken again before
 * erase_header(); proc_sys_invalidate_dcache() is called in between.  The
 * function returns with sysctl_lock held.
 */
static void start_unregistering(struct ctl_table_header *p)
{
	/*
	 * if p->used is 0, nobody will ever touch that entry again;
	 * we'll eliminate all paths to it before dropping sysctl_lock
	 */
	if (unlikely(p->used)) {
		/* unuse_table() completes this once p->used drops to zero */
		struct completion wait;
		init_completion(&wait);
		p->unregistering = &wait;
		spin_unlock(&sysctl_lock);
		wait_for_completion(&wait);
	} else {
		/* anything non-NULL; we'll never dereference it */
		p->unregistering = ERR_PTR(-EINVAL);
		spin_unlock(&sysctl_lock);
	}
	/*
	 * Invalidate dentries for unregistered sysctls: namespaced sysctls
	 * can have duplicate names and contaminate dcache very badly.
	 */
	proc_sys_invalidate_dcache(p);
	/*
	 * do not remove from the list until nobody holds it; walking the
	 * list in do_sysctl() relies on that.
	 *
	 * NOTE(review): do_sysctl() is not defined in the visible part of
	 * this file - confirm that this remark is still current.
	 */
	spin_lock(&sysctl_lock);
	erase_header(p);
}
    327
    328static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
    329{
    330	BUG_ON(!head);
    331	spin_lock(&sysctl_lock);
    332	if (!use_table(head))
    333		head = ERR_PTR(-ENOENT);
    334	spin_unlock(&sysctl_lock);
    335	return head;
    336}
    337
    338static void sysctl_head_finish(struct ctl_table_header *head)
    339{
    340	if (!head)
    341		return;
    342	spin_lock(&sysctl_lock);
    343	unuse_table(head);
    344	spin_unlock(&sysctl_lock);
    345}
    346
    347static struct ctl_table_set *
    348lookup_header_set(struct ctl_table_root *root)
    349{
    350	struct ctl_table_set *set = &root->default_set;
    351	if (root->lookup)
    352		set = root->lookup(root);
    353	return set;
    354}
    355
    356static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
    357				      struct ctl_dir *dir,
    358				      const char *name, int namelen)
    359{
    360	struct ctl_table_header *head;
    361	struct ctl_table *entry;
    362
    363	spin_lock(&sysctl_lock);
    364	entry = find_entry(&head, dir, name, namelen);
    365	if (entry && use_table(head))
    366		*phead = head;
    367	else
    368		entry = NULL;
    369	spin_unlock(&sysctl_lock);
    370	return entry;
    371}
    372
    373static struct ctl_node *first_usable_entry(struct rb_node *node)
    374{
    375	struct ctl_node *ctl_node;
    376
    377	for (;node; node = rb_next(node)) {
    378		ctl_node = rb_entry(node, struct ctl_node, node);
    379		if (use_table(ctl_node->header))
    380			return ctl_node;
    381	}
    382	return NULL;
    383}
    384
    385static void first_entry(struct ctl_dir *dir,
    386	struct ctl_table_header **phead, struct ctl_table **pentry)
    387{
    388	struct ctl_table_header *head = NULL;
    389	struct ctl_table *entry = NULL;
    390	struct ctl_node *ctl_node;
    391
    392	spin_lock(&sysctl_lock);
    393	ctl_node = first_usable_entry(rb_first(&dir->root));
    394	spin_unlock(&sysctl_lock);
    395	if (ctl_node) {
    396		head = ctl_node->header;
    397		entry = &head->ctl_table[ctl_node - head->node];
    398	}
    399	*phead = head;
    400	*pentry = entry;
    401}
    402
    403static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
    404{
    405	struct ctl_table_header *head = *phead;
    406	struct ctl_table *entry = *pentry;
    407	struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
    408
    409	spin_lock(&sysctl_lock);
    410	unuse_table(head);
    411
    412	ctl_node = first_usable_entry(rb_next(&ctl_node->node));
    413	spin_unlock(&sysctl_lock);
    414	head = NULL;
    415	if (ctl_node) {
    416		head = ctl_node->header;
    417		entry = &head->ctl_table[ctl_node - head->node];
    418	}
    419	*phead = head;
    420	*pentry = entry;
    421}
    422
    423/*
    424 * sysctl_perm does NOT grant the superuser all rights automatically, because
    425 * some sysctl variables are readonly even to root.
    426 */
    427
    428static int test_perm(int mode, int op)
    429{
    430	if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
    431		mode >>= 6;
    432	else if (in_egroup_p(GLOBAL_ROOT_GID))
    433		mode >>= 3;
    434	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
    435		return 0;
    436	return -EACCES;
    437}
    438
    439static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
    440{
    441	struct ctl_table_root *root = head->root;
    442	int mode;
    443
    444	if (root->permissions)
    445		mode = root->permissions(head, table);
    446	else
    447		mode = table->mode;
    448
    449	return test_perm(mode, op);
    450}
    451
    452static struct inode *proc_sys_make_inode(struct super_block *sb,
    453		struct ctl_table_header *head, struct ctl_table *table)
    454{
    455	struct ctl_table_root *root = head->root;
    456	struct inode *inode;
    457	struct proc_inode *ei;
    458
    459	inode = new_inode(sb);
    460	if (!inode)
    461		return ERR_PTR(-ENOMEM);
    462
    463	inode->i_ino = get_next_ino();
    464
    465	ei = PROC_I(inode);
    466
    467	spin_lock(&sysctl_lock);
    468	if (unlikely(head->unregistering)) {
    469		spin_unlock(&sysctl_lock);
    470		iput(inode);
    471		return ERR_PTR(-ENOENT);
    472	}
    473	ei->sysctl = head;
    474	ei->sysctl_entry = table;
    475	hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
    476	head->count++;
    477	spin_unlock(&sysctl_lock);
    478
    479	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
    480	inode->i_mode = table->mode;
    481	if (!S_ISDIR(table->mode)) {
    482		inode->i_mode |= S_IFREG;
    483		inode->i_op = &proc_sys_inode_operations;
    484		inode->i_fop = &proc_sys_file_operations;
    485	} else {
    486		inode->i_mode |= S_IFDIR;
    487		inode->i_op = &proc_sys_dir_operations;
    488		inode->i_fop = &proc_sys_dir_file_operations;
    489		if (is_empty_dir(head))
    490			make_empty_dir_inode(inode);
    491	}
    492
    493	if (root->set_ownership)
    494		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
    495	else {
    496		inode->i_uid = GLOBAL_ROOT_UID;
    497		inode->i_gid = GLOBAL_ROOT_GID;
    498	}
    499
    500	return inode;
    501}
    502
    503void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
    504{
    505	spin_lock(&sysctl_lock);
    506	hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
    507	if (!--head->count)
    508		kfree_rcu(head, rcu);
    509	spin_unlock(&sysctl_lock);
    510}
    511
    512static struct ctl_table_header *grab_header(struct inode *inode)
    513{
    514	struct ctl_table_header *head = PROC_I(inode)->sysctl;
    515	if (!head)
    516		head = &sysctl_table_root.default_set.dir.header;
    517	return sysctl_head_grab(head);
    518}
    519
    520static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
    521					unsigned int flags)
    522{
    523	struct ctl_table_header *head = grab_header(dir);
    524	struct ctl_table_header *h = NULL;
    525	const struct qstr *name = &dentry->d_name;
    526	struct ctl_table *p;
    527	struct inode *inode;
    528	struct dentry *err = ERR_PTR(-ENOENT);
    529	struct ctl_dir *ctl_dir;
    530	int ret;
    531
    532	if (IS_ERR(head))
    533		return ERR_CAST(head);
    534
    535	ctl_dir = container_of(head, struct ctl_dir, header);
    536
    537	p = lookup_entry(&h, ctl_dir, name->name, name->len);
    538	if (!p)
    539		goto out;
    540
    541	if (S_ISLNK(p->mode)) {
    542		ret = sysctl_follow_link(&h, &p);
    543		err = ERR_PTR(ret);
    544		if (ret)
    545			goto out;
    546	}
    547
    548	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
    549	if (IS_ERR(inode)) {
    550		err = ERR_CAST(inode);
    551		goto out;
    552	}
    553
    554	d_set_d_op(dentry, &proc_sys_dentry_operations);
    555	err = d_splice_alias(inode, dentry);
    556
    557out:
    558	if (h)
    559		sysctl_head_finish(h);
    560	sysctl_head_finish(head);
    561	return err;
    562}
    563
/*
 * Common implementation of the read_iter/write_iter file operations.
 *
 * @iocb:  kiocb of the request; supplies the file and the file position
 * @iter:  user data to read from (write) or to fill (read)
 * @write: non-zero for a write, zero for a read
 *
 * Holds a use reference on the sysctl header for the duration of the call,
 * checks the permissions, stages the data in a kernel buffer, runs the
 * BPF cgroup sysctl hook and then the entry's proc_handler.
 *
 * Returns the number of bytes transferred or a negative errno: -EPERM if
 * the permission check fails, -EINVAL if the entry has no handler, -ENOMEM
 * if the request is too large or the buffer cannot be allocated, -EFAULT
 * if copying from/to @iter fails, or the error returned by the BPF hook or
 * the handler.
 */
static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
		int write)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	size_t count = iov_iter_count(iter);
	char *kbuf;
	ssize_t error;

	if (IS_ERR(head))
		return PTR_ERR(head);

	/*
	 * At this point we know that the sysctl was not unregistered
	 * and won't be until we finish.
	 */
	error = -EPERM;
	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
		goto out;

	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
	error = -EINVAL;
	if (!table->proc_handler)
		goto out;

	/* don't even try if the size is too large */
	error = -ENOMEM;
	if (count >= KMALLOC_MAX_SIZE)
		goto out;
	/* one extra byte so that written data can be NUL-terminated below */
	kbuf = kvzalloc(count + 1, GFP_KERNEL);
	if (!kbuf)
		goto out;

	if (write) {
		error = -EFAULT;
		if (!copy_from_iter_full(kbuf, count, iter))
			goto out_free_buf;
		kbuf[count] = '\0';
	}

	/* the hook receives pointers to kbuf and count, so it may change both */
	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
					   &iocb->ki_pos);
	if (error)
		goto out_free_buf;

	/* careful: calling conventions are nasty here */
	error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
	if (error)
		goto out_free_buf;

	if (!write) {
		error = -EFAULT;
		if (copy_to_iter(kbuf, count, iter) < count)
			goto out_free_buf;
	}

	/* success: report the count as left behind by the handler */
	error = count;
out_free_buf:
	kvfree(kbuf);
out:
	sysctl_head_finish(head);

	return error;
}
    629
    630static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
    631{
    632	return proc_sys_call_handler(iocb, iter, 0);
    633}
    634
    635static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
    636{
    637	return proc_sys_call_handler(iocb, iter, 1);
    638}
    639
    640static int proc_sys_open(struct inode *inode, struct file *filp)
    641{
    642	struct ctl_table_header *head = grab_header(inode);
    643	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
    644
    645	/* sysctl was unregistered */
    646	if (IS_ERR(head))
    647		return PTR_ERR(head);
    648
    649	if (table->poll)
    650		filp->private_data = proc_sys_poll_event(table->poll);
    651
    652	sysctl_head_finish(head);
    653
    654	return 0;
    655}
    656
    657static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
    658{
    659	struct inode *inode = file_inode(filp);
    660	struct ctl_table_header *head = grab_header(inode);
    661	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
    662	__poll_t ret = DEFAULT_POLLMASK;
    663	unsigned long event;
    664
    665	/* sysctl was unregistered */
    666	if (IS_ERR(head))
    667		return EPOLLERR | EPOLLHUP;
    668
    669	if (!table->proc_handler)
    670		goto out;
    671
    672	if (!table->poll)
    673		goto out;
    674
    675	event = (unsigned long)filp->private_data;
    676	poll_wait(filp, &table->poll->wait, wait);
    677
    678	if (event != atomic_read(&table->poll->event)) {
    679		filp->private_data = proc_sys_poll_event(table->poll);
    680		ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
    681	}
    682
    683out:
    684	sysctl_head_finish(head);
    685
    686	return ret;
    687}
    688
/*
 * Emit the directory entry for @table (an entry of @head) into @ctx,
 * making sure first that a dentry with an inode exists for it under the
 * directory that @file refers to.
 *
 * Returns the result of dir_emit(), or false if the dentry or inode could
 * not be set up.
 */
static bool proc_sys_fill_cache(struct file *file,
				struct dir_context *ctx,
				struct ctl_table_header *head,
				struct ctl_table *table)
{
	struct dentry *child, *dir = file->f_path.dentry;
	struct inode *inode;
	struct qstr qname;
	ino_t ino = 0;
	unsigned type = DT_UNKNOWN;

	qname.name = table->procname;
	qname.len  = strlen(table->procname);
	qname.hash = full_name_hash(dir, qname.name, qname.len);

	/* use the cached dentry if there is one, otherwise create it */
	child = d_lookup(dir, &qname);
	if (!child) {
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
		child = d_alloc_parallel(dir, &qname, &wq);
		if (IS_ERR(child))
			return false;
		/* still "in lookup": it is up to us to attach an inode */
		if (d_in_lookup(child)) {
			struct dentry *res;
			inode = proc_sys_make_inode(dir->d_sb, head, table);
			if (IS_ERR(inode)) {
				d_lookup_done(child);
				dput(child);
				return false;
			}
			d_set_d_op(child, &proc_sys_dentry_operations);
			res = d_splice_alias(inode, child);
			d_lookup_done(child);
			if (unlikely(res)) {
				if (IS_ERR(res)) {
					dput(child);
					return false;
				}
				/* a different dentry was returned: continue with it */
				dput(child);
				child = res;
			}
		}
	}
	inode = d_inode(child);
	ino  = inode->i_ino;
	/* the file type bits of i_mode, shifted down, are passed as d_type */
	type = inode->i_mode >> 12;
	dput(child);
	return dir_emit(ctx, qname.name, qname.len, ino, type);
}
    737
    738static bool proc_sys_link_fill_cache(struct file *file,
    739				    struct dir_context *ctx,
    740				    struct ctl_table_header *head,
    741				    struct ctl_table *table)
    742{
    743	bool ret = true;
    744
    745	head = sysctl_head_grab(head);
    746	if (IS_ERR(head))
    747		return false;
    748
    749	/* It is not an error if we can not follow the link ignore it */
    750	if (sysctl_follow_link(&head, &table))
    751		goto out;
    752
    753	ret = proc_sys_fill_cache(file, ctx, head, table);
    754out:
    755	sysctl_head_finish(head);
    756	return ret;
    757}
    758
    759static int scan(struct ctl_table_header *head, struct ctl_table *table,
    760		unsigned long *pos, struct file *file,
    761		struct dir_context *ctx)
    762{
    763	bool res;
    764
    765	if ((*pos)++ < ctx->pos)
    766		return true;
    767
    768	if (unlikely(S_ISLNK(table->mode)))
    769		res = proc_sys_link_fill_cache(file, ctx, head, table);
    770	else
    771		res = proc_sys_fill_cache(file, ctx, head, table);
    772
    773	if (res)
    774		ctx->pos = *pos;
    775
    776	return res;
    777}
    778
    779static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
    780{
    781	struct ctl_table_header *head = grab_header(file_inode(file));
    782	struct ctl_table_header *h = NULL;
    783	struct ctl_table *entry;
    784	struct ctl_dir *ctl_dir;
    785	unsigned long pos;
    786
    787	if (IS_ERR(head))
    788		return PTR_ERR(head);
    789
    790	ctl_dir = container_of(head, struct ctl_dir, header);
    791
    792	if (!dir_emit_dots(file, ctx))
    793		goto out;
    794
    795	pos = 2;
    796
    797	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
    798		if (!scan(h, entry, &pos, file, ctx)) {
    799			sysctl_head_finish(h);
    800			break;
    801		}
    802	}
    803out:
    804	sysctl_head_finish(head);
    805	return 0;
    806}
    807
    808static int proc_sys_permission(struct user_namespace *mnt_userns,
    809			       struct inode *inode, int mask)
    810{
    811	/*
    812	 * sysctl entries that are not writeable,
    813	 * are _NOT_ writeable, capabilities or not.
    814	 */
    815	struct ctl_table_header *head;
    816	struct ctl_table *table;
    817	int error;
    818
    819	/* Executable files are not allowed under /proc/sys/ */
    820	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
    821		return -EACCES;
    822
    823	head = grab_header(inode);
    824	if (IS_ERR(head))
    825		return PTR_ERR(head);
    826
    827	table = PROC_I(inode)->sysctl_entry;
    828	if (!table) /* global root - r-xr-xr-x */
    829		error = mask & MAY_WRITE ? -EACCES : 0;
    830	else /* Use the permissions on the sysctl table entry */
    831		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
    832
    833	sysctl_head_finish(head);
    834	return error;
    835}
    836
    837static int proc_sys_setattr(struct user_namespace *mnt_userns,
    838			    struct dentry *dentry, struct iattr *attr)
    839{
    840	struct inode *inode = d_inode(dentry);
    841	int error;
    842
    843	if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
    844		return -EPERM;
    845
    846	error = setattr_prepare(&init_user_ns, dentry, attr);
    847	if (error)
    848		return error;
    849
    850	setattr_copy(&init_user_ns, inode, attr);
    851	mark_inode_dirty(inode);
    852	return 0;
    853}
    854
    855static int proc_sys_getattr(struct user_namespace *mnt_userns,
    856			    const struct path *path, struct kstat *stat,
    857			    u32 request_mask, unsigned int query_flags)
    858{
    859	struct inode *inode = d_inode(path->dentry);
    860	struct ctl_table_header *head = grab_header(inode);
    861	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
    862
    863	if (IS_ERR(head))
    864		return PTR_ERR(head);
    865
    866	generic_fillattr(&init_user_ns, inode, stat);
    867	if (table)
    868		stat->mode = (stat->mode & S_IFMT) | table->mode;
    869
    870	sysctl_head_finish(head);
    871	return 0;
    872}
    873
/* File operations that proc_sys_make_inode() installs on non-directory inodes. */
static const struct file_operations proc_sys_file_operations = {
	.open		= proc_sys_open,
	.poll		= proc_sys_poll,
	.read_iter	= proc_sys_read,
	.write_iter	= proc_sys_write,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.llseek		= default_llseek,
};
    883
/* File operations that proc_sys_make_inode() installs on directory inodes. */
static const struct file_operations proc_sys_dir_file_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_sys_readdir,
	.llseek		= generic_file_llseek,
};
    889
/* Inode operations that proc_sys_make_inode() installs on non-directory inodes. */
static const struct inode_operations proc_sys_inode_operations = {
	.permission	= proc_sys_permission,
	.setattr	= proc_sys_setattr,
	.getattr	= proc_sys_getattr,
};
    895
/*
 * Inode operations that proc_sys_make_inode() installs on directory inodes;
 * the same as for files, plus ->lookup.
 */
static const struct inode_operations proc_sys_dir_operations = {
	.lookup		= proc_sys_lookup,
	.permission	= proc_sys_permission,
	.setattr	= proc_sys_setattr,
	.getattr	= proc_sys_getattr,
};
    902
    903static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
    904{
    905	if (flags & LOOKUP_RCU)
    906		return -ECHILD;
    907	return !PROC_I(d_inode(dentry))->sysctl->unregistering;
    908}
    909
    910static int proc_sys_delete(const struct dentry *dentry)
    911{
    912	return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
    913}
    914
    915static int sysctl_is_seen(struct ctl_table_header *p)
    916{
    917	struct ctl_table_set *set = p->set;
    918	int res;
    919	spin_lock(&sysctl_lock);
    920	if (p->unregistering)
    921		res = 0;
    922	else if (!set->is_seen)
    923		res = 1;
    924	else
    925		res = set->is_seen(set);
    926	spin_unlock(&sysctl_lock);
    927	return res;
    928}
    929
    930static int proc_sys_compare(const struct dentry *dentry,
    931		unsigned int len, const char *str, const struct qstr *name)
    932{
    933	struct ctl_table_header *head;
    934	struct inode *inode;
    935
    936	/* Although proc doesn't have negative dentries, rcu-walk means
    937	 * that inode here can be NULL */
    938	/* AV: can it, indeed? */
    939	inode = d_inode_rcu(dentry);
    940	if (!inode)
    941		return 1;
    942	if (name->len != len)
    943		return 1;
    944	if (memcmp(name->name, str, len))
    945		return 1;
    946	head = rcu_dereference(PROC_I(inode)->sysctl);
    947	return !head || !sysctl_is_seen(head);
    948}
    949
/*
 * Dentry operations that proc_sys_lookup() and proc_sys_fill_cache() attach
 * with d_set_d_op() to the dentries they set up.
 */
static const struct dentry_operations proc_sys_dentry_operations = {
	.d_revalidate	= proc_sys_revalidate,
	.d_delete	= proc_sys_delete,
	.d_compare	= proc_sys_compare,
};
    955
    956static struct ctl_dir *find_subdir(struct ctl_dir *dir,
    957				   const char *name, int namelen)
    958{
    959	struct ctl_table_header *head;
    960	struct ctl_table *entry;
    961
    962	entry = find_entry(&head, dir, name, namelen);
    963	if (!entry)
    964		return ERR_PTR(-ENOENT);
    965	if (!S_ISDIR(entry->mode))
    966		return ERR_PTR(-ENOTDIR);
    967	return container_of(head, struct ctl_dir, header);
    968}
    969
    970static struct ctl_dir *new_dir(struct ctl_table_set *set,
    971			       const char *name, int namelen)
    972{
    973	struct ctl_table *table;
    974	struct ctl_dir *new;
    975	struct ctl_node *node;
    976	char *new_name;
    977
    978	new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
    979		      sizeof(struct ctl_table)*2 +  namelen + 1,
    980		      GFP_KERNEL);
    981	if (!new)
    982		return NULL;
    983
    984	node = (struct ctl_node *)(new + 1);
    985	table = (struct ctl_table *)(node + 1);
    986	new_name = (char *)(table + 2);
    987	memcpy(new_name, name, namelen);
    988	table[0].procname = new_name;
    989	table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
    990	init_header(&new->header, set->dir.header.root, set, node, table);
    991
    992	return new;
    993}
    994
/**
 * get_subdir - find or create a subdir with the specified name.
 * @dir:  Directory to create the subdirectory in
 * @name: The name of the subdirectory to find or create
 * @namelen: The length of name
 *
 * Takes a directory with an elevated reference count so we know that
 * if we drop the lock the directory will not go away.  Upon success
 * the reference is moved from @dir to the returned subdirectory.
 * Upon error an error code is returned and the reference on @dir is
 * simply dropped.
 *
 * Return: the subdirectory, or an ERR_PTR(): -ENOTDIR if @name exists but
 * is not a directory, -ENOMEM if a new directory could not be allocated,
 * or the error from insert_header().
 */
static struct ctl_dir *get_subdir(struct ctl_dir *dir,
				  const char *name, int namelen)
{
	struct ctl_table_set *set = dir->header.set;
	struct ctl_dir *subdir, *new = NULL;
	int err;

	spin_lock(&sysctl_lock);
	subdir = find_subdir(dir, name, namelen);
	if (!IS_ERR(subdir))
		goto found;
	if (PTR_ERR(subdir) != -ENOENT)
		goto failed;

	/* new_dir() allocates with GFP_KERNEL, so call it without the spinlock */
	spin_unlock(&sysctl_lock);
	new = new_dir(set, name, namelen);
	spin_lock(&sysctl_lock);
	subdir = ERR_PTR(-ENOMEM);
	if (!new)
		goto failed;

	/* Was the subdir added while we dropped the lock? */
	subdir = find_subdir(dir, name, namelen);
	if (!IS_ERR(subdir))
		goto found;
	if (PTR_ERR(subdir) != -ENOENT)
		goto failed;

	/* Nope.  Use our freshly made directory entry. */
	err = insert_header(dir, &new->header);
	subdir = ERR_PTR(err);
	if (err)
		goto failed;
	subdir = new;
found:
	/* this is the reference handed to the caller in place of the one on @dir */
	subdir->header.nreg++;
failed:
	if (IS_ERR(subdir)) {
		pr_err("sysctl could not get directory: ");
		sysctl_print_dir(dir);
		pr_cont("%*.*s %ld\n", namelen, namelen, name,
			PTR_ERR(subdir));
	}
	/* the caller's reference on @dir is dropped on every path */
	drop_sysctl_table(&dir->header);
	/* drop the reference that init_header() gave to the new directory */
	if (new)
		drop_sysctl_table(&new->header);
	spin_unlock(&sysctl_lock);
	return subdir;
}
   1056
   1057static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
   1058{
   1059	struct ctl_dir *parent;
   1060	const char *procname;
   1061	if (!dir->header.parent)
   1062		return &set->dir;
   1063	parent = xlate_dir(set, dir->header.parent);
   1064	if (IS_ERR(parent))
   1065		return parent;
   1066	procname = dir->header.ctl_table[0].procname;
   1067	return find_subdir(parent, procname, strlen(procname));
   1068}
   1069
   1070static int sysctl_follow_link(struct ctl_table_header **phead,
   1071	struct ctl_table **pentry)
   1072{
   1073	struct ctl_table_header *head;
   1074	struct ctl_table_root *root;
   1075	struct ctl_table_set *set;
   1076	struct ctl_table *entry;
   1077	struct ctl_dir *dir;
   1078	int ret;
   1079
   1080	spin_lock(&sysctl_lock);
   1081	root = (*pentry)->data;
   1082	set = lookup_header_set(root);
   1083	dir = xlate_dir(set, (*phead)->parent);
   1084	if (IS_ERR(dir))
   1085		ret = PTR_ERR(dir);
   1086	else {
   1087		const char *procname = (*pentry)->procname;
   1088		head = NULL;
   1089		entry = find_entry(&head, dir, procname, strlen(procname));
   1090		ret = -ENOENT;
   1091		if (entry && use_table(head)) {
   1092			unuse_table(*phead);
   1093			*phead = head;
   1094			*pentry = entry;
   1095			ret = 0;
   1096		}
   1097	}
   1098
   1099	spin_unlock(&sysctl_lock);
   1100	return ret;
   1101}
   1102
   1103static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
   1104{
   1105	struct va_format vaf;
   1106	va_list args;
   1107
   1108	va_start(args, fmt);
   1109	vaf.fmt = fmt;
   1110	vaf.va = &args;
   1111
   1112	pr_err("sysctl table check failed: %s/%s %pV\n",
   1113	       path, table->procname, &vaf);
   1114
   1115	va_end(args);
   1116	return -EINVAL;
   1117}
   1118
   1119static int sysctl_check_table_array(const char *path, struct ctl_table *table)
   1120{
   1121	int err = 0;
   1122
   1123	if ((table->proc_handler == proc_douintvec) ||
   1124	    (table->proc_handler == proc_douintvec_minmax)) {
   1125		if (table->maxlen != sizeof(unsigned int))
   1126			err |= sysctl_err(path, table, "array not allowed");
   1127	}
   1128
   1129	if (table->proc_handler == proc_dou8vec_minmax) {
   1130		if (table->maxlen != sizeof(u8))
   1131			err |= sysctl_err(path, table, "array not allowed");
   1132	}
   1133
   1134	return err;
   1135}
   1136
   1137static int sysctl_check_table(const char *path, struct ctl_table *table)
   1138{
   1139	struct ctl_table *entry;
   1140	int err = 0;
   1141	list_for_each_table_entry(entry, table) {
   1142		if (entry->child)
   1143			err |= sysctl_err(path, entry, "Not a file");
   1144
   1145		if ((entry->proc_handler == proc_dostring) ||
   1146		    (entry->proc_handler == proc_dointvec) ||
   1147		    (entry->proc_handler == proc_douintvec) ||
   1148		    (entry->proc_handler == proc_douintvec_minmax) ||
   1149		    (entry->proc_handler == proc_dointvec_minmax) ||
   1150		    (entry->proc_handler == proc_dou8vec_minmax) ||
   1151		    (entry->proc_handler == proc_dointvec_jiffies) ||
   1152		    (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
   1153		    (entry->proc_handler == proc_dointvec_ms_jiffies) ||
   1154		    (entry->proc_handler == proc_doulongvec_minmax) ||
   1155		    (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
   1156			if (!entry->data)
   1157				err |= sysctl_err(path, entry, "No data");
   1158			if (!entry->maxlen)
   1159				err |= sysctl_err(path, entry, "No maxlen");
   1160			else
   1161				err |= sysctl_check_table_array(path, entry);
   1162		}
   1163		if (!entry->proc_handler)
   1164			err |= sysctl_err(path, entry, "No proc_handler");
   1165
   1166		if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
   1167			err |= sysctl_err(path, entry, "bogus .mode 0%o",
   1168				entry->mode);
   1169	}
   1170	return err;
   1171}
   1172
   1173static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
   1174	struct ctl_table_root *link_root)
   1175{
   1176	struct ctl_table *link_table, *entry, *link;
   1177	struct ctl_table_header *links;
   1178	struct ctl_node *node;
   1179	char *link_name;
   1180	int nr_entries, name_bytes;
   1181
   1182	name_bytes = 0;
   1183	nr_entries = 0;
   1184	list_for_each_table_entry(entry, table) {
   1185		nr_entries++;
   1186		name_bytes += strlen(entry->procname) + 1;
   1187	}
   1188
   1189	links = kzalloc(sizeof(struct ctl_table_header) +
   1190			sizeof(struct ctl_node)*nr_entries +
   1191			sizeof(struct ctl_table)*(nr_entries + 1) +
   1192			name_bytes,
   1193			GFP_KERNEL);
   1194
   1195	if (!links)
   1196		return NULL;
   1197
   1198	node = (struct ctl_node *)(links + 1);
   1199	link_table = (struct ctl_table *)(node + nr_entries);
   1200	link_name = (char *)&link_table[nr_entries + 1];
   1201	link = link_table;
   1202
   1203	list_for_each_table_entry(entry, table) {
   1204		int len = strlen(entry->procname) + 1;
   1205		memcpy(link_name, entry->procname, len);
   1206		link->procname = link_name;
   1207		link->mode = S_IFLNK|S_IRWXUGO;
   1208		link->data = link_root;
   1209		link_name += len;
   1210		link++;
   1211	}
   1212	init_header(links, dir->header.root, dir->header.set, node, link_table);
   1213	links->nreg = nr_entries;
   1214
   1215	return links;
   1216}
   1217
   1218static bool get_links(struct ctl_dir *dir,
   1219	struct ctl_table *table, struct ctl_table_root *link_root)
   1220{
   1221	struct ctl_table_header *head;
   1222	struct ctl_table *entry, *link;
   1223
   1224	/* Are there links available for every entry in table? */
   1225	list_for_each_table_entry(entry, table) {
   1226		const char *procname = entry->procname;
   1227		link = find_entry(&head, dir, procname, strlen(procname));
   1228		if (!link)
   1229			return false;
   1230		if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
   1231			continue;
   1232		if (S_ISLNK(link->mode) && (link->data == link_root))
   1233			continue;
   1234		return false;
   1235	}
   1236
   1237	/* The checks passed.  Increase the registration count on the links */
   1238	list_for_each_table_entry(entry, table) {
   1239		const char *procname = entry->procname;
   1240		link = find_entry(&head, dir, procname, strlen(procname));
   1241		head->nreg++;
   1242	}
   1243	return true;
   1244}
   1245
/*
 * Make the entries of @head visible as links in the default sysctl set.
 *
 * Nothing is done for headers that already live in the default set, or
 * when the matching directory cannot be found there.  If suitable links
 * already exist, get_links() just takes a reference on them; otherwise a
 * new link header is allocated and inserted.
 *
 * The function unlocks and re-locks sysctl_lock around the allocation, so
 * it expects to be entered with that lock held.
 *
 * Returns 0 on success (including the "nothing to do" cases), -ENOMEM if
 * the links cannot be allocated, or the error from insert_header().
 */
static int insert_links(struct ctl_table_header *head)
{
	struct ctl_table_set *root_set = &sysctl_table_root.default_set;
	struct ctl_dir *core_parent = NULL;
	struct ctl_table_header *links;
	int err;

	if (head->set == root_set)
		return 0;

	core_parent = xlate_dir(root_set, head->parent);
	if (IS_ERR(core_parent))
		return 0;

	if (get_links(core_parent, head->ctl_table, head->root))
		return 0;

	/* Hold a reference on core_parent across the unlocked allocation. */
	core_parent->header.nreg++;
	spin_unlock(&sysctl_lock);

	links = new_links(core_parent, head->ctl_table, head->root);

	spin_lock(&sysctl_lock);
	err = -ENOMEM;
	if (!links)
		goto out;

	/* Links may have appeared while the lock was dropped; re-check. */
	err = 0;
	if (get_links(core_parent, head->ctl_table, head->root)) {
		kfree(links);
		goto out;
	}

	err = insert_header(core_parent, links);
	if (err)
		kfree(links);
out:
	/* Release the reference taken before the lock was dropped. */
	drop_sysctl_table(&core_parent->header);
	return err;
}
   1286
/**
 * __register_sysctl_table - register a leaf sysctl table
 * @set: Sysctl tree to register on
 * @path: The path to the directory the sysctl table is in.
 * @table: the top-level table structure
 *
 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
 * array. A completely 0 filled entry terminates the table.
 *
 * The members of the &struct ctl_table structure are used as follows:
 *
 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
 *            enter a sysctl file
 *
 * data - a pointer to data for use by proc_handler
 *
 * maxlen - the maximum size in bytes of the data
 *
 * mode - the file permissions for the /proc/sys file
 *
 * child - must be %NULL.
 *
 * proc_handler - the text handler routine (described below)
 *
 * extra1, extra2 - extra pointers usable by the proc handler routines
 *
 * Leaf nodes in the sysctl tree will be represented by a single file
 * under /proc; non-leaf nodes will be represented by directories.
 *
 * There must be a proc_handler routine for any terminal nodes.
 * Several default handlers are available to cover common cases -
 *
 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
 *
 * It is the handler's job to read the input buffer from user memory
 * and process it. The handler should return 0 on success.
 *
 * This routine returns %NULL on a failure to register, and a pointer
 * to the table header on success.
 */
struct ctl_table_header *__register_sysctl_table(
	struct ctl_table_set *set,
	const char *path, struct ctl_table *table)
{
	struct ctl_table_root *root = set->dir.header.root;
	struct ctl_table_header *header;
	const char *name, *nextname;
	struct ctl_dir *dir;
	struct ctl_table *entry;
	struct ctl_node *node;
	int nr_entries = 0;

	list_for_each_table_entry(entry, table)
		nr_entries++;

	/* The header is followed by one ctl_node per table entry. */
	header = kzalloc(sizeof(struct ctl_table_header) +
			 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
	if (!header)
		return NULL;

	node = (struct ctl_node *)(header + 1);
	init_header(header, root, set, node, table);
	if (sysctl_check_table(path, table))
		goto fail;

	spin_lock(&sysctl_lock);
	dir = &set->dir;
	/* The reference is moved down the directory tree by get_subdir() */
	dir->header.nreg++;
	spin_unlock(&sysctl_lock);

	/* Find the directory for the ctl_table */
	for (name = path; name; name = nextname) {
		int namelen;
		nextname = strchr(name, '/');
		if (nextname) {
			namelen = nextname - name;
			nextname++;
		} else {
			namelen = strlen(name);
		}
		/* Skip empty components such as doubled or trailing slashes. */
		if (namelen == 0)
			continue;

		/* On error get_subdir() has already dropped the reference. */
		dir = get_subdir(dir, name, namelen);
		if (IS_ERR(dir))
			goto fail;
	}

	spin_lock(&sysctl_lock);
	if (insert_header(dir, header))
		goto fail_put_dir_locked;

	/* Drop the reference that was carried down the path walk. */
	drop_sysctl_table(&dir->header);
	spin_unlock(&sysctl_lock);

	return header;

fail_put_dir_locked:
	drop_sysctl_table(&dir->header);
	spin_unlock(&sysctl_lock);
fail:
	kfree(header);
	/* Show who attempted the failed registration. */
	dump_stack();
	return NULL;
}
   1395
   1396/**
   1397 * register_sysctl - register a sysctl table
   1398 * @path: The path to the directory the sysctl table is in.
   1399 * @table: the table structure
   1400 *
   1401 * Register a sysctl table. @table should be a filled in ctl_table
   1402 * array. A completely 0 filled entry terminates the table.
   1403 *
   1404 * See __register_sysctl_table for more details.
   1405 */
   1406struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
   1407{
   1408	return __register_sysctl_table(&sysctl_table_root.default_set,
   1409					path, table);
   1410}
   1411EXPORT_SYMBOL(register_sysctl);
   1412
   1413/**
   1414 * __register_sysctl_init() - register sysctl table to path
   1415 * @path: path name for sysctl base
   1416 * @table: This is the sysctl table that needs to be registered to the path
   1417 * @table_name: The name of sysctl table, only used for log printing when
   1418 *              registration fails
   1419 *
   1420 * The sysctl interface is used by userspace to query or modify at runtime
   1421 * a predefined value set on a variable. These variables however have default
   1422 * values pre-set. Code which depends on these variables will always work even
   1423 * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
   1424 * ability to query or modify the sysctls dynamically at run time. Chances of
   1425 * register_sysctl() failing on init are extremely low, and so for both reasons
   1426 * this function does not return any error as it is used by initialization code.
   1427 *
   1428 * Context: Can only be called after your respective sysctl base path has been
   1429 * registered. So for instance, most base directories are registered early on
   1430 * init before init levels are processed through proc_sys_init() and
   1431 * sysctl_init_bases().
   1432 */
   1433void __init __register_sysctl_init(const char *path, struct ctl_table *table,
   1434				 const char *table_name)
   1435{
   1436	struct ctl_table_header *hdr = register_sysctl(path, table);
   1437
   1438	if (unlikely(!hdr)) {
   1439		pr_err("failed when register_sysctl %s to %s\n", table_name, path);
   1440		return;
   1441	}
   1442	kmemleak_not_leak(hdr);
   1443}
   1444
   1445static char *append_path(const char *path, char *pos, const char *name)
   1446{
   1447	int namelen;
   1448	namelen = strlen(name);
   1449	if (((pos - path) + namelen + 2) >= PATH_MAX)
   1450		return NULL;
   1451	memcpy(pos, name, namelen);
   1452	pos[namelen] = '/';
   1453	pos[namelen + 1] = '\0';
   1454	pos += namelen + 1;
   1455	return pos;
   1456}
   1457
   1458static int count_subheaders(struct ctl_table *table)
   1459{
   1460	int has_files = 0;
   1461	int nr_subheaders = 0;
   1462	struct ctl_table *entry;
   1463
   1464	/* special case: no directory and empty directory */
   1465	if (!table || !table->procname)
   1466		return 1;
   1467
   1468	list_for_each_table_entry(entry, table) {
   1469		if (entry->child)
   1470			nr_subheaders += count_subheaders(entry->child);
   1471		else
   1472			has_files = 1;
   1473	}
   1474	return nr_subheaders + has_files;
   1475}
   1476
/*
 * Register the files of @table, then recurse into its child tables.
 *
 * @path:      start of the path buffer; it is extended in place while
 *             descending and truncated again afterwards
 * @pos:       end of the current path inside that buffer
 * @subheader: cursor into the caller's array of registered headers; each
 *             header registered here is stored and the cursor advanced
 * @set:       sysctl set to register on
 * @table:     table for this directory level
 *
 * When a level mixes files and directories, the file entries are copied
 * into a freshly allocated table, which is remembered in the header's
 * ->ctl_table_arg so it can be freed later.
 *
 * Returns 0 on success, -ENOMEM if an allocation or registration fails,
 * -ENAMETOOLONG if the path does not fit, or the error from a deeper
 * level.  Headers registered before a failure are left in place.
 */
static int register_leaf_sysctl_tables(const char *path, char *pos,
	struct ctl_table_header ***subheader, struct ctl_table_set *set,
	struct ctl_table *table)
{
	struct ctl_table *ctl_table_arg = NULL;
	struct ctl_table *entry, *files;
	int nr_files = 0;
	int nr_dirs = 0;
	int err = -ENOMEM;

	list_for_each_table_entry(entry, table) {
		if (entry->child)
			nr_dirs++;
		else
			nr_files++;
	}

	files = table;
	/* If there are mixed files and directories we need a new table */
	if (nr_dirs && nr_files) {
		struct ctl_table *new;
		/* One extra zeroed entry terminates the copied table. */
		files = kcalloc(nr_files + 1, sizeof(struct ctl_table),
				GFP_KERNEL);
		if (!files)
			goto out;

		ctl_table_arg = files;
		new = files;

		list_for_each_table_entry(entry, table) {
			if (entry->child)
				continue;
			*new = *entry;
			new++;
		}
	}

	/* Register everything except a directory full of subdirectories */
	if (nr_files || !nr_dirs) {
		struct ctl_table_header *header;
		header = __register_sysctl_table(set, path, files);
		if (!header) {
			kfree(ctl_table_arg);
			goto out;
		}

		/* Remember if we need to free the file table */
		header->ctl_table_arg = ctl_table_arg;
		**subheader = header;
		(*subheader)++;
	}

	/* Recurse into the subdirectories. */
	list_for_each_table_entry(entry, table) {
		char *child_pos;

		if (!entry->child)
			continue;

		err = -ENAMETOOLONG;
		child_pos = append_path(path, pos, entry->procname);
		if (!child_pos)
			goto out;

		err = register_leaf_sysctl_tables(path, child_pos, subheader,
						  set, entry->child);
		/* Cut the path back to this level for the next sibling. */
		pos[0] = '\0';
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure our caller will unregister all registered subheaders */
	return err;
}
   1552
   1553/**
   1554 * __register_sysctl_paths - register a sysctl table hierarchy
   1555 * @set: Sysctl tree to register on
   1556 * @path: The path to the directory the sysctl table is in.
   1557 * @table: the top-level table structure
   1558 *
   1559 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
   1560 * array. A completely 0 filled entry terminates the table.
   1561 *
   1562 * See __register_sysctl_table for more details.
   1563 */
   1564struct ctl_table_header *__register_sysctl_paths(
   1565	struct ctl_table_set *set,
   1566	const struct ctl_path *path, struct ctl_table *table)
   1567{
   1568	struct ctl_table *ctl_table_arg = table;
   1569	int nr_subheaders = count_subheaders(table);
   1570	struct ctl_table_header *header = NULL, **subheaders, **subheader;
   1571	const struct ctl_path *component;
   1572	char *new_path, *pos;
   1573
   1574	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
   1575	if (!new_path)
   1576		return NULL;
   1577
   1578	pos[0] = '\0';
   1579	for (component = path; component->procname; component++) {
   1580		pos = append_path(new_path, pos, component->procname);
   1581		if (!pos)
   1582			goto out;
   1583	}
   1584	while (table->procname && table->child && !table[1].procname) {
   1585		pos = append_path(new_path, pos, table->procname);
   1586		if (!pos)
   1587			goto out;
   1588		table = table->child;
   1589	}
   1590	if (nr_subheaders == 1) {
   1591		header = __register_sysctl_table(set, new_path, table);
   1592		if (header)
   1593			header->ctl_table_arg = ctl_table_arg;
   1594	} else {
   1595		header = kzalloc(sizeof(*header) +
   1596				 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
   1597		if (!header)
   1598			goto out;
   1599
   1600		subheaders = (struct ctl_table_header **) (header + 1);
   1601		subheader = subheaders;
   1602		header->ctl_table_arg = ctl_table_arg;
   1603
   1604		if (register_leaf_sysctl_tables(new_path, pos, &subheader,
   1605						set, table))
   1606			goto err_register_leaves;
   1607	}
   1608
   1609out:
   1610	kfree(new_path);
   1611	return header;
   1612
   1613err_register_leaves:
   1614	while (subheader > subheaders) {
   1615		struct ctl_table_header *subh = *(--subheader);
   1616		struct ctl_table *table = subh->ctl_table_arg;
   1617		unregister_sysctl_table(subh);
   1618		kfree(table);
   1619	}
   1620	kfree(header);
   1621	header = NULL;
   1622	goto out;
   1623}
   1624
   1625/**
   1626 * register_sysctl_paths - register a sysctl table hierarchy
   1627 * @path: The path to the directory the sysctl table is in.
   1628 * @table: the top-level table structure
   1629 *
   1630 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
   1631 * array. A completely 0 filled entry terminates the table.
   1632 *
   1633 * See __register_sysctl_paths for more details.
   1634 */
   1635struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
   1636						struct ctl_table *table)
   1637{
   1638	return __register_sysctl_paths(&sysctl_table_root.default_set,
   1639					path, table);
   1640}
   1641EXPORT_SYMBOL(register_sysctl_paths);
   1642
   1643/**
   1644 * register_sysctl_table - register a sysctl table hierarchy
   1645 * @table: the top-level table structure
   1646 *
   1647 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
   1648 * array. A completely 0 filled entry terminates the table.
   1649 *
   1650 * See register_sysctl_paths for more details.
   1651 */
   1652struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
   1653{
   1654	static const struct ctl_path null_path[] = { {} };
   1655
   1656	return register_sysctl_paths(null_path, table);
   1657}
   1658EXPORT_SYMBOL(register_sysctl_table);
   1659
   1660int __register_sysctl_base(struct ctl_table *base_table)
   1661{
   1662	struct ctl_table_header *hdr;
   1663
   1664	hdr = register_sysctl_table(base_table);
   1665	kmemleak_not_leak(hdr);
   1666	return 0;
   1667}
   1668
   1669static void put_links(struct ctl_table_header *header)
   1670{
   1671	struct ctl_table_set *root_set = &sysctl_table_root.default_set;
   1672	struct ctl_table_root *root = header->root;
   1673	struct ctl_dir *parent = header->parent;
   1674	struct ctl_dir *core_parent;
   1675	struct ctl_table *entry;
   1676
   1677	if (header->set == root_set)
   1678		return;
   1679
   1680	core_parent = xlate_dir(root_set, parent);
   1681	if (IS_ERR(core_parent))
   1682		return;
   1683
   1684	list_for_each_table_entry(entry, header->ctl_table) {
   1685		struct ctl_table_header *link_head;
   1686		struct ctl_table *link;
   1687		const char *name = entry->procname;
   1688
   1689		link = find_entry(&link_head, core_parent, name, strlen(name));
   1690		if (link &&
   1691		    ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
   1692		     (S_ISLNK(link->mode) && (link->data == root)))) {
   1693			drop_sysctl_table(link_head);
   1694		}
   1695		else {
   1696			pr_err("sysctl link missing during unregister: ");
   1697			sysctl_print_dir(parent);
   1698			pr_cont("%s\n", name);
   1699		}
   1700	}
   1701}
   1702
/*
 * Drop one registration reference on @header.
 *
 * Nothing more happens while other registrations remain.  When the last
 * one goes away, a header that has a parent gives up its links and is
 * unregistered, the header is freed via RCU once its ->count also drops
 * to zero, and finally one registration is dropped on the parent
 * directory, which may repeat the process further up the tree.
 *
 * The callers visible in this file invoke it with sysctl_lock held.
 */
static void drop_sysctl_table(struct ctl_table_header *header)
{
	/* Read the parent first: @header may be freed further down. */
	struct ctl_dir *parent = header->parent;

	if (--header->nreg)
		return;

	if (parent) {
		put_links(header);
		start_unregistering(header);
	}

	if (!--header->count)
		kfree_rcu(header, rcu);

	if (parent)
		drop_sysctl_table(&parent->header);
}
   1721
   1722/**
   1723 * unregister_sysctl_table - unregister a sysctl table hierarchy
   1724 * @header: the header returned from register_sysctl_table
   1725 *
   1726 * Unregisters the sysctl table and all children. proc entries may not
   1727 * actually be removed until they are no longer used by anyone.
   1728 */
   1729void unregister_sysctl_table(struct ctl_table_header * header)
   1730{
   1731	int nr_subheaders;
   1732	might_sleep();
   1733
   1734	if (header == NULL)
   1735		return;
   1736
   1737	nr_subheaders = count_subheaders(header->ctl_table_arg);
   1738	if (unlikely(nr_subheaders > 1)) {
   1739		struct ctl_table_header **subheaders;
   1740		int i;
   1741
   1742		subheaders = (struct ctl_table_header **)(header + 1);
   1743		for (i = nr_subheaders -1; i >= 0; i--) {
   1744			struct ctl_table_header *subh = subheaders[i];
   1745			struct ctl_table *table = subh->ctl_table_arg;
   1746			unregister_sysctl_table(subh);
   1747			kfree(table);
   1748		}
   1749		kfree(header);
   1750		return;
   1751	}
   1752
   1753	spin_lock(&sysctl_lock);
   1754	drop_sysctl_table(header);
   1755	spin_unlock(&sysctl_lock);
   1756}
   1757EXPORT_SYMBOL(unregister_sysctl_table);
   1758
/*
 * setup_sysctl_set - initialise a sysctl table set
 * @set: set to initialise; any previous contents are cleared
 * @root: root handed to init_header() for the set's top-level directory
 * @is_seen: callback stored in @set->is_seen
 *
 * The top-level directory header of the set is initialised with
 * root_table as its table and no ctl_node array.
 */
void setup_sysctl_set(struct ctl_table_set *set,
	struct ctl_table_root *root,
	int (*is_seen)(struct ctl_table_set *))
{
	memset(set, 0, sizeof(*set));
	set->is_seen = is_seen;
	init_header(&set->dir.header, root, set, NULL, root_table);
}
   1767
/*
 * retire_sysctl_set - check that a sysctl table set is empty
 * @set: set being retired
 *
 * Warns if the rbtree of the set's top-level directory still has entries.
 * Nothing is freed here.
 */
void retire_sysctl_set(struct ctl_table_set *set)
{
	WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
}
   1772
   1773int __init proc_sys_init(void)
   1774{
   1775	struct proc_dir_entry *proc_sys_root;
   1776
   1777	proc_sys_root = proc_mkdir("sys", NULL);
   1778	proc_sys_root->proc_iops = &proc_sys_dir_operations;
   1779	proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
   1780	proc_sys_root->nlink = 0;
   1781
   1782	return sysctl_init_bases();
   1783}
   1784
/* Maps a legacy kernel command line parameter to the sysctl it sets. */
struct sysctl_alias {
	/* Parameter name as it appears on the kernel command line. */
	const char *kernel_param;
	/* Dotted sysctl name, e.g. "kernel.hung_task_panic". */
	const char *sysctl_param;
};
   1789
/*
 * Historically some settings had both a sysctl and a command line parameter.
 * With the generic "sysctl." parameter support, we can handle them in a
 * single place and only keep the historical name for compatibility. This is
 * not meant for adding brand new aliases. When adding existing aliases,
 * consider whether the possibly different moment of changing the value
 * (e.g. from early_param to the moment do_sysctl_args() is called) is an
 * issue for the specific parameter.
 *
 * The table is terminated by an entry whose kernel_param is NULL.
 */
static const struct sysctl_alias sysctl_aliases[] = {
	{"hardlockup_all_cpu_backtrace",	"kernel.hardlockup_all_cpu_backtrace" },
	{"hung_task_panic",			"kernel.hung_task_panic" },
	{"numa_zonelist_order",			"vm.numa_zonelist_order" },
	{"softlockup_all_cpu_backtrace",	"kernel.softlockup_all_cpu_backtrace" },
	{"softlockup_panic",			"kernel.softlockup_panic" },
	{ }
};
   1807
   1808static const char *sysctl_find_alias(char *param)
   1809{
   1810	const struct sysctl_alias *alias;
   1811
   1812	for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
   1813		if (strcmp(alias->kernel_param, param) == 0)
   1814			return alias->sysctl_param;
   1815	}
   1816
   1817	return NULL;
   1818}
   1819
   1820/* Set sysctl value passed on kernel command line. */
   1821static int process_sysctl_arg(char *param, char *val,
   1822			       const char *unused, void *arg)
   1823{
   1824	char *path;
   1825	struct vfsmount **proc_mnt = arg;
   1826	struct file_system_type *proc_fs_type;
   1827	struct file *file;
   1828	int len;
   1829	int err;
   1830	loff_t pos = 0;
   1831	ssize_t wret;
   1832
   1833	if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
   1834		param += sizeof("sysctl") - 1;
   1835
   1836		if (param[0] != '/' && param[0] != '.')
   1837			return 0;
   1838
   1839		param++;
   1840	} else {
   1841		param = (char *) sysctl_find_alias(param);
   1842		if (!param)
   1843			return 0;
   1844	}
   1845
   1846	if (!val)
   1847		return -EINVAL;
   1848	len = strlen(val);
   1849	if (len == 0)
   1850		return -EINVAL;
   1851
   1852	/*
   1853	 * To set sysctl options, we use a temporary mount of proc, look up the
   1854	 * respective sys/ file and write to it. To avoid mounting it when no
   1855	 * options were given, we mount it only when the first sysctl option is
   1856	 * found. Why not a persistent mount? There are problems with a
   1857	 * persistent mount of proc in that it forces userspace not to use any
   1858	 * proc mount options.
   1859	 */
   1860	if (!*proc_mnt) {
   1861		proc_fs_type = get_fs_type("proc");
   1862		if (!proc_fs_type) {
   1863			pr_err("Failed to find procfs to set sysctl from command line\n");
   1864			return 0;
   1865		}
   1866		*proc_mnt = kern_mount(proc_fs_type);
   1867		put_filesystem(proc_fs_type);
   1868		if (IS_ERR(*proc_mnt)) {
   1869			pr_err("Failed to mount procfs to set sysctl from command line\n");
   1870			return 0;
   1871		}
   1872	}
   1873
   1874	path = kasprintf(GFP_KERNEL, "sys/%s", param);
   1875	if (!path)
   1876		panic("%s: Failed to allocate path for %s\n", __func__, param);
   1877	strreplace(path, '.', '/');
   1878
   1879	file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
   1880	if (IS_ERR(file)) {
   1881		err = PTR_ERR(file);
   1882		if (err == -ENOENT)
   1883			pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
   1884				param, val);
   1885		else if (err == -EACCES)
   1886			pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
   1887				param, val);
   1888		else
   1889			pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
   1890				file, param, val);
   1891		goto out;
   1892	}
   1893	wret = kernel_write(file, val, len, &pos);
   1894	if (wret < 0) {
   1895		err = wret;
   1896		if (err == -EINVAL)
   1897			pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
   1898				param, val);
   1899		else
   1900			pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
   1901				ERR_PTR(err), param, val);
   1902	} else if (wret != len) {
   1903		pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
   1904			wret, len, path, param, val);
   1905	}
   1906
   1907	err = filp_close(file, NULL);
   1908	if (err)
   1909		pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
   1910			ERR_PTR(err), param, val);
   1911out:
   1912	kfree(path);
   1913	return 0;
   1914}
   1915
   1916void do_sysctl_args(void)
   1917{
   1918	char *command_line;
   1919	struct vfsmount *proc_mnt = NULL;
   1920
   1921	command_line = kstrdup(saved_command_line, GFP_KERNEL);
   1922	if (!command_line)
   1923		panic("%s: Failed to allocate copy of command line\n", __func__);
   1924
   1925	parse_args("Setting sysctl args", command_line,
   1926		   NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
   1927
   1928	if (proc_mnt)
   1929		kern_unmount(proc_mnt);
   1930
   1931	kfree(command_line);
   1932}