cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_api.c (55968B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
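
/* Hedged illustration (not part of this file): a minimal qdisc that
 * follows the enqueue/dequeue contract described above, modeled on the
 * in-tree pfifo qdisc.  The "example_fifo" names are hypothetical; the
 * helpers (qdisc_enqueue_tail() etc.) are the real ones from
 * <net/sch_generic.h>.  Kept under #if 0 so the sketch is never built.
 */
#if 0
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	/* 0 (NET_XMIT_SUCCESS) on success, NET_XMIT_DROP on drop */
	if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
		return qdisc_enqueue_tail(skb, sch);

	return qdisc_drop(skb, sch, to_free);
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	/* Fancier qdiscs may return NULL even when q->q.qlen != 0 */
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
	.id		= "example_fifo",
	.enqueue	= example_fifo_enqueue,
	.dequeue	= example_fifo_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};
#endif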

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

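/* Hedged illustration (not part of this file): the usual module
 * boilerplate that drives register_qdisc()/unregister_qdisc(), using
 * the hypothetical example_fifo_qdisc_ops sketched earlier.
 */
#if 0
static int __init example_fifo_module_init(void)
{
	return register_qdisc(&example_fifo_qdisc_ops);
}

static void __exit example_fifo_module_exit(void)
{
	unregister_qdisc(&example_fifo_qdisc_ops);
}

module_init(example_fifo_module_init);
module_exit(example_fifo_module_exit);
MODULE_LICENSE("GPL");
#endif
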
/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (root qdisc, all its children, children of children, etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2, in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utilities, we detect the linklayer setting by detecting whether the
 * rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu we find the low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
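
/* Worked example for the detection above, under assumed inputs
 * mpu = 100 and cell_log = 3: low = roundup(100, 48) = 144,
 * high = roundup(145, 48) = 192, so cell_low = 144 >> 3 = 18 and
 * cell_high = (192 >> 3) - 1 = 23.  With ATM's 48-byte alignment,
 * rtab[18]..rtab[23] all describe the same cell and thus hold the
 * same value; for plain Ethernet tables they generally differ.
 */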

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
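
/* Hedged usage sketch (not part of this file): how a qdisc's ->init()
 * might take and later release a shared rate table.  "r", "tab" and
 * "extack" are hypothetical locals parsed from the qdisc's netlink
 * options; identical tables are shared via refcount, not duplicated.
 */
#if 0
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(&r, tab, extack);	/* takes a reference */
	if (!rtab)
		return -EINVAL;
	/* ... look up transmission times in rtab->data[] ... */
	qdisc_put_rtab(rtab);			/* drops the reference */
#endif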

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
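
/* Worked example for __qdisc_calculate_pkt_len(), under an assumed
 * size table with overhead = 24, cell_align = 0, cell_log = 6,
 * tsize = 512 and size_log = 0.  For skb->len = 1000:
 * pkt_len = 1000 + 24 = 1024, slot = 1024 >> 6 = 16, so the qdisc
 * sees stab->data[16] as the packet length.  Slots at or beyond
 * tsize are extrapolated from the last table entry, as coded above.
 */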

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
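
/* Hedged usage sketch (not part of this file): a shaper-style qdisc
 * typically embeds a qdisc_watchdog in its private data, arms it from
 * ->dequeue() when the head packet is not yet eligible, and cancels it
 * in ->destroy().  "q", "t_next" and "now" are hypothetical locals.
 */
#if 0
	qdisc_watchdog_init(&q->watchdog, sch);		/* in ->init() */

	/* in ->dequeue(), when the head packet may only leave at t_next: */
	if (t_next > now) {
		qdisc_watchdog_schedule_ns(&q->watchdog, t_next);
		return NULL;	/* queue not empty, just not ready yet */
	}

	qdisc_watchdog_cancel(&q->watchdog);		/* in ->destroy() */
#endif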

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
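
/* Hedged usage sketch (not part of this file): lifecycle of the class
 * hash in a classful qdisc, modeled on what HTB-style qdiscs do.  "q"
 * is hypothetical private data holding a Qdisc_class_hash "clhash";
 * each class embeds a Qdisc_class_common "common".
 */
#if 0
	err = qdisc_class_hash_init(&q->clhash);	/* in ->init() */
	if (err)
		return err;

	cl->common.classid = classid;			/* on class creation */
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	qdisc_class_hash_grow(sch, &q->clhash);		/* rehash if loaded */

	qdisc_class_hash_remove(&q->clhash, &cl->common); /* on class delete */
	qdisc_class_hash_destroy(&q->clhash);		/* in ->destroy() */
#endif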

/* Allocate a unique handle from the space managed by the kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}
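
/* Handle layout note: a 32-bit handle carries the major number in its
 * upper 16 bits and the minor in the lower 16 ("major:minor" in tc
 * syntax).  autohandle starts at 8000:0 and is pre-incremented, so the
 * first automatically allocated handle on a device is 8001:0.
 */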

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If the child was empty even before the update, then the
		 * backlog counter is screwed and we skip the notification
		 * because the parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
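
/* Hedged usage sketch (not part of this file): a qdisc that drops one
 * packet of byte length "len" from a child deep in its hierarchy
 * propagates the accounting change up through all ancestors with a
 * single call ("child" and "len" are hypothetical locals):
 *
 *	qdisc_tree_reduce_backlog(child, 1, len);
 */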

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report an error if the graft is part of a destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report an error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach && !ingress)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   rtnl_dereference(dev->qdisc), new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}
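
/* Hedged illustration (not part of this file): the ops hooks driven by
 * qdisc_block_indexes_set(), for a hypothetical qdisc that keeps a
 * shared ingress block index in its private data; clsact and ingress
 * implement the real versions of these callbacks.
 */
#if 0
static void example_ingress_block_set(struct Qdisc *sch, u32 block_index)
{
	struct example_sched_data *q = qdisc_priv(sch);

	q->ingress_block = block_index;
}

static u32 example_ingress_block_get(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);

	return q->ingress_block;
}
#endif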

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again with qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put_track(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because the change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
   1793
   1794static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
   1795			  unsigned long cl,
   1796			  u32 portid, u32 seq, u16 flags, int event)
   1797{
   1798	struct tcmsg *tcm;
   1799	struct nlmsghdr  *nlh;
   1800	unsigned char *b = skb_tail_pointer(skb);
   1801	struct gnet_dump d;
   1802	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
   1803
   1804	cond_resched();
   1805	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
   1806	if (!nlh)
   1807		goto out_nlmsg_trim;
   1808	tcm = nlmsg_data(nlh);
   1809	tcm->tcm_family = AF_UNSPEC;
   1810	tcm->tcm__pad1 = 0;
   1811	tcm->tcm__pad2 = 0;
   1812	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
   1813	tcm->tcm_parent = q->handle;
   1814	tcm->tcm_handle = q->handle;
   1815	tcm->tcm_info = 0;
   1816	if (nla_put_string(skb, TCA_KIND, q->ops->id))
   1817		goto nla_put_failure;
   1818	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
   1819		goto nla_put_failure;
   1820
   1821	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
   1822					 NULL, &d, TCA_PAD) < 0)
   1823		goto nla_put_failure;
   1824
   1825	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
   1826		goto nla_put_failure;
   1827
   1828	if (gnet_stats_finish_copy(&d) < 0)
   1829		goto nla_put_failure;
   1830
   1831	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
   1832	return skb->len;
   1833
   1834out_nlmsg_trim:
   1835nla_put_failure:
   1836	nlmsg_trim(skb, b);
   1837	return -1;
   1838}
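/*
 * Resulting message layout (illustrative; which attributes appear
 * depends on the qdisc's class ops):
 *
 *   struct nlmsghdr        RTM_NEWTCLASS / RTM_DELTCLASS
 *   struct tcmsg           tcm_ifindex, tcm_parent, tcm_handle
 *   TCA_KIND   (string)    e.g. "htb"
 *   <class attributes>     appended by cl_ops->dump()
 *   TCA_STATS2 (nest)      stats via gnet_dump, plus TCA_STATS /
 *                          TCA_XSTATS compat copies for old userspace
 */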
   1839
   1840static int tclass_notify(struct net *net, struct sk_buff *oskb,
   1841			 struct nlmsghdr *n, struct Qdisc *q,
   1842			 unsigned long cl, int event)
   1843{
   1844	struct sk_buff *skb;
   1845	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
   1846
   1847	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
   1848	if (!skb)
   1849		return -ENOBUFS;
   1850
   1851	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
   1852		kfree_skb(skb);
   1853		return -EINVAL;
   1854	}
   1855
   1856	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
   1857			      n->nlmsg_flags & NLM_F_ECHO);
   1858}
   1859
   1860static int tclass_del_notify(struct net *net,
   1861			     const struct Qdisc_class_ops *cops,
   1862			     struct sk_buff *oskb, struct nlmsghdr *n,
   1863			     struct Qdisc *q, unsigned long cl,
   1864			     struct netlink_ext_ack *extack)
   1865{
   1866	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
   1867	struct sk_buff *skb;
   1868	int err = 0;
   1869
   1870	if (!cops->delete)
   1871		return -EOPNOTSUPP;
   1872
   1873	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
   1874	if (!skb)
   1875		return -ENOBUFS;
   1876
   1877	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
   1878			   RTM_DELTCLASS) < 0) {
   1879		kfree_skb(skb);
   1880		return -EINVAL;
   1881	}
   1882
   1883	err = cops->delete(q, cl, extack);
   1884	if (err) {
   1885		kfree_skb(skb);
   1886		return err;
   1887	}
   1888
   1889	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
   1890			     n->nlmsg_flags & NLM_F_ECHO);
   1891	return err;
   1892}
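/*
 * Note on the ordering above: the notification skb is filled in
 * before cops->delete() runs, because the class, and whatever
 * cl_ops->dump needs from it, no longer exists afterwards; if the
 * delete itself fails, the prebuilt skb is simply discarded.
 */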
   1893
   1894#ifdef CONFIG_NET_CLS
   1895
   1896struct tcf_bind_args {
   1897	struct tcf_walker w;
   1898	unsigned long base;
   1899	unsigned long cl;
   1900	u32 classid;
   1901};
   1902
   1903static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
   1904{
   1905	struct tcf_bind_args *a = (void *)arg;
   1906
   1907	if (tp->ops->bind_class) {
   1908		struct Qdisc *q = tcf_block_q(tp->chain->block);
   1909
   1910		sch_tree_lock(q);
   1911		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
   1912		sch_tree_unlock(q);
   1913	}
   1914	return 0;
   1915}
   1916
   1917struct tc_bind_class_args {
   1918	struct qdisc_walker w;
   1919	unsigned long new_cl;
   1920	u32 portid;
   1921	u32 clid;
   1922};
   1923
   1924static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
   1925				struct qdisc_walker *w)
   1926{
   1927	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
   1928	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
   1929	struct tcf_block *block;
   1930	struct tcf_chain *chain;
   1931
   1932	block = cops->tcf_block(q, cl, NULL);
   1933	if (!block)
   1934		return 0;
   1935	for (chain = tcf_get_next_chain(block, NULL);
   1936	     chain;
   1937	     chain = tcf_get_next_chain(block, chain)) {
   1938		struct tcf_proto *tp;
   1939
   1940		for (tp = tcf_get_next_proto(chain, NULL);
   1941		     tp; tp = tcf_get_next_proto(chain, tp)) {
   1942			struct tcf_bind_args arg = {};
   1943
   1944			arg.w.fn = tcf_node_bind;
   1945			arg.classid = a->clid;
   1946			arg.base = cl;
   1947			arg.cl = a->new_cl;
   1948			tp->ops->walk(tp, &arg.w, true);
   1949		}
   1950	}
   1951
   1952	return 0;
   1953}
   1954
   1955static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
   1956			   unsigned long new_cl)
   1957{
   1958	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
   1959	struct tc_bind_class_args args = {};
   1960
   1961	if (!cops->tcf_block)
   1962		return;
   1963	args.portid = portid;
   1964	args.clid = clid;
   1965	args.new_cl = new_cl;
   1966	args.w.fn = tc_bind_class_walker;
    1967	cops->walk(q, &args.w);
   1968}
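/*
 * Sketch of the classifier end of this walk (illustrative, modeled on
 * cls_u32; "example_filter" is a hypothetical name).  The chain is
 * tc_bind_tclass() -> tc_bind_class_walker() (every class) ->
 * tcf_node_bind() (every filter node) -> tp->ops->bind_class(), which
 * typically re-points a cached class reference:
 */
#if 0
static void example_bind_class(void *fh, u32 classid, unsigned long cl,
			       void *q, unsigned long base)
{
	struct example_filter *f = fh;

	/* Only touch filters that actually point at the class. */
	if (f && f->res.classid == classid) {
		if (cl)
			__tcf_bind_filter(q, &f->res, base);
		else
			__tcf_unbind_filter(q, &f->res);
	}
}
#endif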
   1969
   1970#else
   1971
   1972static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
   1973			   unsigned long new_cl)
   1974{
   1975}
   1976
   1977#endif
   1978
   1979static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
   1980			 struct netlink_ext_ack *extack)
   1981{
   1982	struct net *net = sock_net(skb->sk);
   1983	struct tcmsg *tcm = nlmsg_data(n);
   1984	struct nlattr *tca[TCA_MAX + 1];
   1985	struct net_device *dev;
   1986	struct Qdisc *q = NULL;
   1987	const struct Qdisc_class_ops *cops;
   1988	unsigned long cl = 0;
   1989	unsigned long new_cl;
   1990	u32 portid;
   1991	u32 clid;
   1992	u32 qid;
   1993	int err;
   1994
   1995	if ((n->nlmsg_type != RTM_GETTCLASS) &&
   1996	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
   1997		return -EPERM;
   1998
   1999	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
   2000				     rtm_tca_policy, extack);
   2001	if (err < 0)
   2002		return err;
   2003
   2004	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
   2005	if (!dev)
   2006		return -ENODEV;
   2007
    2008	/*
    2009	 * parent == TC_H_UNSPEC - unspecified parent.
    2010	 * parent == TC_H_ROOT   - class is root, which has no parent.
    2011	 * parent == X:0	 - parent is root class.
    2012	 * parent == X:Y	 - parent is a node in hierarchy.
    2013	 * parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
    2014	 *
    2015	 * handle == 0:0	 - generate handle from kernel pool.
    2016	 * handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
    2017	 * handle == X:Y	 - fully specified.
    2018	 * handle == X:0	 - root class.
    2019	 */
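/*
 * Worked example (macros from <linux/pkt_sched.h>): a handle is a
 * 32-bit value with the major number in the upper 16 bits and the
 * minor in the lower 16, so tc's "1:10" (minor printed in hex) is:
 *
 *   TC_H_MAKE(1 << 16, 0x10) == 0x00010010
 *   TC_H_MAJ(0x00010010)     == 0x00010000   (qdisc 1:)
 *   TC_H_MIN(0x00010010)     == 0x00000010   (class :10)
 *
 * TC_H_ROOT is the all-ones sentinel 0xFFFFFFFFU; TC_H_UNSPEC is 0.
 */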
   2020
   2021	/* Step 1. Determine qdisc handle X:0 */
   2022
   2023	portid = tcm->tcm_parent;
   2024	clid = tcm->tcm_handle;
   2025	qid = TC_H_MAJ(clid);
   2026
   2027	if (portid != TC_H_ROOT) {
   2028		u32 qid1 = TC_H_MAJ(portid);
   2029
   2030		if (qid && qid1) {
   2031			/* If both majors are known, they must be identical. */
   2032			if (qid != qid1)
   2033				return -EINVAL;
   2034		} else if (qid1) {
   2035			qid = qid1;
   2036		} else if (qid == 0)
   2037			qid = rtnl_dereference(dev->qdisc)->handle;
   2038
   2039		/* Now qid is genuine qdisc handle consistent
   2040		 * both with parent and child.
   2041		 *
   2042		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
   2043		 */
   2044		if (portid)
   2045			portid = TC_H_MAKE(qid, portid);
   2046	} else {
   2047		if (qid == 0)
   2048			qid = rtnl_dereference(dev->qdisc)->handle;
   2049	}
   2050
   2051	/* OK. Locate qdisc */
   2052	q = qdisc_lookup(dev, qid);
   2053	if (!q)
   2054		return -ENOENT;
   2055
    2056	/* And check that it supports classes */
   2057	cops = q->ops->cl_ops;
   2058	if (cops == NULL)
   2059		return -EINVAL;
   2060
   2061	/* Now try to get class */
   2062	if (clid == 0) {
   2063		if (portid == TC_H_ROOT)
   2064			clid = qid;
   2065	} else
   2066		clid = TC_H_MAKE(qid, clid);
   2067
   2068	if (clid)
   2069		cl = cops->find(q, clid);
   2070
   2071	if (cl == 0) {
   2072		err = -ENOENT;
   2073		if (n->nlmsg_type != RTM_NEWTCLASS ||
   2074		    !(n->nlmsg_flags & NLM_F_CREATE))
   2075			goto out;
   2076	} else {
   2077		switch (n->nlmsg_type) {
   2078		case RTM_NEWTCLASS:
   2079			err = -EEXIST;
   2080			if (n->nlmsg_flags & NLM_F_EXCL)
   2081				goto out;
   2082			break;
   2083		case RTM_DELTCLASS:
   2084			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
    2085		/* Unbind filters from the deleted class by rebinding them to 0 */
   2086			tc_bind_tclass(q, portid, clid, 0);
   2087			goto out;
   2088		case RTM_GETTCLASS:
   2089			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
   2090			goto out;
   2091		default:
   2092			err = -EINVAL;
   2093			goto out;
   2094		}
   2095	}
   2096
   2097	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
   2098		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
   2099		return -EOPNOTSUPP;
   2100	}
   2101
   2102	new_cl = cl;
   2103	err = -EOPNOTSUPP;
   2104	if (cops->change)
   2105		err = cops->change(q, clid, portid, tca, &new_cl, extack);
   2106	if (err == 0) {
   2107		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
    2108		/* We just created a new class; do the reverse binding. */
   2109		if (cl != new_cl)
   2110			tc_bind_tclass(q, portid, clid, new_cl);
   2111	}
   2112out:
   2113	return err;
   2114}
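/*
 * Userspace mapping (approximate; flags as sent by iproute2's tc):
 *
 *   tc class add     -> RTM_NEWTCLASS, NLM_F_CREATE | NLM_F_EXCL
 *   tc class change  -> RTM_NEWTCLASS, no creation flags
 *   tc class replace -> RTM_NEWTCLASS, NLM_F_CREATE
 *   tc class del     -> RTM_DELTCLASS
 *   tc class show    -> RTM_GETTCLASS dump (tc_dump_tclass below)
 */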
   2115
   2116struct qdisc_dump_args {
   2117	struct qdisc_walker	w;
   2118	struct sk_buff		*skb;
   2119	struct netlink_callback	*cb;
   2120};
   2121
   2122static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
   2123			    struct qdisc_walker *arg)
   2124{
   2125	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
   2126
   2127	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
   2128			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
   2129			      RTM_NEWTCLASS);
   2130}
   2131
   2132static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
   2133				struct tcmsg *tcm, struct netlink_callback *cb,
   2134				int *t_p, int s_t)
   2135{
   2136	struct qdisc_dump_args arg;
   2137
   2138	if (tc_qdisc_dump_ignore(q, false) ||
   2139	    *t_p < s_t || !q->ops->cl_ops ||
   2140	    (tcm->tcm_parent &&
   2141	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
   2142		(*t_p)++;
   2143		return 0;
   2144	}
   2145	if (*t_p > s_t)
   2146		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
   2147	arg.w.fn = qdisc_class_dump;
   2148	arg.skb = skb;
   2149	arg.cb = cb;
    2150	arg.w.stop = 0;
   2151	arg.w.skip = cb->args[1];
   2152	arg.w.count = 0;
   2153	q->ops->cl_ops->walk(q, &arg.w);
   2154	cb->args[1] = arg.w.count;
   2155	if (arg.w.stop)
   2156		return -1;
   2157	(*t_p)++;
   2158	return 0;
   2159}
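/*
 * Note (descriptive): cb->args[0] tracks the qdisc index t across dump
 * invocations, while cb->args[1..] carry the class-walk position inside
 * the interrupted qdisc; the memset above wipes that stale cursor once
 * the walk moves past the qdisc in which the previous dump stopped.
 */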
   2160
   2161static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
   2162			       struct tcmsg *tcm, struct netlink_callback *cb,
   2163			       int *t_p, int s_t, bool recur)
   2164{
   2165	struct Qdisc *q;
   2166	int b;
   2167
   2168	if (!root)
   2169		return 0;
   2170
   2171	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
   2172		return -1;
   2173
   2174	if (!qdisc_dev(root) || !recur)
   2175		return 0;
   2176
   2177	if (tcm->tcm_parent) {
   2178		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
   2179		if (q && q != root &&
   2180		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
   2181			return -1;
   2182		return 0;
   2183	}
   2184	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
   2185		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
   2186			return -1;
   2187	}
   2188
   2189	return 0;
   2190}
   2191
   2192static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
   2193{
   2194	struct tcmsg *tcm = nlmsg_data(cb->nlh);
   2195	struct net *net = sock_net(skb->sk);
   2196	struct netdev_queue *dev_queue;
   2197	struct net_device *dev;
   2198	int t, s_t;
   2199
   2200	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
   2201		return 0;
   2202	dev = dev_get_by_index(net, tcm->tcm_ifindex);
   2203	if (!dev)
   2204		return 0;
   2205
   2206	s_t = cb->args[0];
   2207	t = 0;
   2208
   2209	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
   2210				skb, tcm, cb, &t, s_t, true) < 0)
   2211		goto done;
   2212
   2213	dev_queue = dev_ingress_queue(dev);
   2214	if (dev_queue &&
   2215	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
   2216				&t, s_t, false) < 0)
   2217		goto done;
   2218
   2219done:
   2220	cb->args[0] = t;
   2221
   2222	dev_put(dev);
   2223	return skb->len;
   2224}
   2225
   2226#ifdef CONFIG_PROC_FS
   2227static int psched_show(struct seq_file *seq, void *v)
   2228{
   2229	seq_printf(seq, "%08x %08x %08x %08x\n",
   2230		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
   2231		   1000000,
   2232		   (u32)NSEC_PER_SEC / hrtimer_resolution);
   2233
   2234	return 0;
   2235}
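/*
 * Example /proc/net/psched output (illustrative, assuming 1 ns hrtimer
 * resolution):
 *
 *   000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. NSEC_PER_USEC = 1000, PSCHED_TICKS2NS(1) = 64, the historical
 * constant 1000000, and NSEC_PER_SEC / hrtimer_resolution = 10^9.
 * iproute2's tc reads these fields to calibrate its tick<->time
 * conversions.
 */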
   2236
   2237static int __net_init psched_net_init(struct net *net)
   2238{
   2239	struct proc_dir_entry *e;
   2240
   2241	e = proc_create_single("psched", 0, net->proc_net, psched_show);
   2242	if (e == NULL)
   2243		return -ENOMEM;
   2244
   2245	return 0;
   2246}
   2247
   2248static void __net_exit psched_net_exit(struct net *net)
   2249{
   2250	remove_proc_entry("psched", net->proc_net);
   2251}
   2252#else
   2253static int __net_init psched_net_init(struct net *net)
   2254{
   2255	return 0;
   2256}
   2257
   2258static void __net_exit psched_net_exit(struct net *net)
   2259{
   2260}
   2261#endif
   2262
   2263static struct pernet_operations psched_net_ops = {
   2264	.init = psched_net_init,
   2265	.exit = psched_net_exit,
   2266};
   2267
   2268static int __init pktsched_init(void)
   2269{
   2270	int err;
   2271
   2272	err = register_pernet_subsys(&psched_net_ops);
   2273	if (err) {
    2274		pr_err("pktsched_init: cannot initialize per netns operations\n");
   2276		return err;
   2277	}
   2278
   2279	register_qdisc(&pfifo_fast_ops);
   2280	register_qdisc(&pfifo_qdisc_ops);
   2281	register_qdisc(&bfifo_qdisc_ops);
   2282	register_qdisc(&pfifo_head_drop_qdisc_ops);
   2283	register_qdisc(&mq_qdisc_ops);
   2284	register_qdisc(&noqueue_qdisc_ops);
   2285
   2286	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
   2287	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
   2288	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
   2289		      0);
   2290	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
   2291	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
   2292	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
   2293		      0);
   2294
   2295	return 0;
   2296}
   2297
   2298subsys_initcall(pktsched_init);
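/*
 * For comparison, a minimal out-of-tree qdisc would plug into the same
 * registry via register_qdisc() (sketch with hypothetical "example_*"
 * symbols; the enqueue/dequeue bodies are elided):
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free);
static struct sk_buff *example_dequeue(struct Qdisc *sch);

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_dequeued,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");
#endif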