cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_cbq.c (43035B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
      4 *
      5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
      6 */
      7
      8#include <linux/module.h>
      9#include <linux/slab.h>
     10#include <linux/types.h>
     11#include <linux/kernel.h>
     12#include <linux/string.h>
     13#include <linux/errno.h>
     14#include <linux/skbuff.h>
     15#include <net/netlink.h>
     16#include <net/pkt_sched.h>
     17#include <net/pkt_cls.h>
     18
     19
     20/*	Class-Based Queueing (CBQ) algorithm.
     21	=======================================
     22
     23	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
     24		 Management Models for Packet Networks",
     25		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
     26
     27		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
     28
     29		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
     30		 Parameters", 1996
     31
     32		 [4] Sally Floyd and Michael Speer, "Experimental Results
     33		 for Class-Based Queueing", 1998, not published.
     34
     35	-----------------------------------------------------------------------
     36
      37	The algorithm skeleton was taken from the NS simulator's cbq.cc.
      38	If someone wants to check this code against the LBL version,
      39	they should take into account that ONLY the skeleton was borrowed;
      40	the implementation is different. In particular:
     41
      42	--- The WRR algorithm is different. Our version looks more
      43	reasonable (I hope) and works when quanta are allowed to be
      44	less than the MTU, which is always the case when real-time
      45	classes have small rates. Note that the statement in [3] is
      46	incomplete: delay may actually be estimated even if the class's
      47	per-round allotment is less than the MTU. Namely, if the per-round
      48	allotment is W*r_i, and r_1+...+r_k = r < 1, then
     49
     50	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
     51
     52	In the worst case we have IntServ estimate with D = W*r+k*MTU
     53	and C = MTU*r. The proof (if correct at all) is trivial.
     54
     55
      56	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
      57	interpret some places, which look like incorrect translations
      58	from NS. Anyone is invited to find these differences
      59	and explain to me why I am wrong 8).
     60
      61	--- Linux has no EOI event, so we cannot estimate true class
      62	idle time. The workaround is to treat the next dequeue event
      63	as a sign that the previous packet has finished. This is wrong
      64	because of internal device queueing, but on a permanently loaded
      65	link it holds. Moreover, combined with the clock integrator, this
      66	scheme looks very close to an ideal solution.  */
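/* A worked instance of the delay bound above (illustrative numbers, not
 * from the original text): take k = 2 classes with r_1 = r_2 = 0.25 (so
 * r = 0.5), W = 3000 bytes and MTU = 1500 bytes. The per-round allotment
 * W*r_1 = 750 bytes is below the MTU, yet
 *
 *	delay_1 <= (ceil(1500/750)*3000*0.5 + 3000*0.5 + 2*1500)/B = 7500/B,
 *
 * i.e. 6 ms on a 10 Mbit/s link (B = 1.25e6 bytes/sec).
 */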
     67
     68struct cbq_sched_data;
     69
     70
     71struct cbq_class {
     72	struct Qdisc_class_common common;
     73	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
     74
     75/* Parameters */
     76	unsigned char		priority;	/* class priority */
     77	unsigned char		priority2;	/* priority to be used after overlimit */
     78	unsigned char		ewma_log;	/* time constant for idle time calculation */
     79
     80	u32			defmap;
     81
     82	/* Link-sharing scheduler parameters */
     83	long			maxidle;	/* Class parameters: see below. */
     84	long			offtime;
     85	long			minidle;
     86	u32			avpkt;
     87	struct qdisc_rate_table	*R_tab;
     88
     89	/* General scheduler (WRR) parameters */
     90	long			allot;
     91	long			quantum;	/* Allotment per WRR round */
     92	long			weight;		/* Relative allotment: see below */
     93
     94	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
     95	struct cbq_class	*split;		/* Ptr to split node */
     96	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
     97	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
     98	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
     99						   parent otherwise */
    100	struct cbq_class	*sibling;	/* Sibling chain */
    101	struct cbq_class	*children;	/* Pointer to children chain */
    102
    103	struct Qdisc		*q;		/* Elementary queueing discipline */
    104
    105
    106/* Variables */
    107	unsigned char		cpriority;	/* Effective priority */
    108	unsigned char		delayed;
    109	unsigned char		level;		/* level of the class in hierarchy:
    110						   0 for leaf classes, and maximal
    111						   level of children + 1 for nodes.
    112						 */
    113
    114	psched_time_t		last;		/* Last end of service */
    115	psched_time_t		undertime;
    116	long			avgidle;
    117	long			deficit;	/* Saved deficit for WRR */
    118	psched_time_t		penalized;
    119	struct gnet_stats_basic_sync bstats;
    120	struct gnet_stats_queue qstats;
    121	struct net_rate_estimator __rcu *rate_est;
    122	struct tc_cbq_xstats	xstats;
    123
    124	struct tcf_proto __rcu	*filter_list;
    125	struct tcf_block	*block;
    126
    127	int			filters;
    128
    129	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
    130};
    131
    132struct cbq_sched_data {
    133	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
    134	int			nclasses[TC_CBQ_MAXPRIO + 1];
    135	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
    136
    137	struct cbq_class	link;
    138
    139	unsigned int		activemask;
    140	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
    141								   with backlog */
    142
    143#ifdef CONFIG_NET_CLS_ACT
    144	struct cbq_class	*rx_class;
    145#endif
    146	struct cbq_class	*tx_class;
    147	struct cbq_class	*tx_borrowed;
    148	int			tx_len;
    149	psched_time_t		now;		/* Cached timestamp */
    150	unsigned int		pmask;
    151
    152	struct hrtimer		delay_timer;
    153	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
    154						   started when CBQ has
    155						   backlog, but cannot
    156						   transmit just now */
    157	psched_tdiff_t		wd_expires;
    158	int			toplevel;
    159	u32			hgenerator;
    160};
    161
    162
    163#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
    164
    165static inline struct cbq_class *
    166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
    167{
    168	struct Qdisc_class_common *clc;
    169
    170	clc = qdisc_class_find(&q->clhash, classid);
    171	if (clc == NULL)
    172		return NULL;
    173	return container_of(clc, struct cbq_class, common);
    174}
    175
    176#ifdef CONFIG_NET_CLS_ACT
    177
    178static struct cbq_class *
    179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
    180{
    181	struct cbq_class *cl;
    182
    183	for (cl = this->tparent; cl; cl = cl->tparent) {
    184		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
    185
    186		if (new != NULL && new != this)
    187			return new;
    188	}
    189	return NULL;
    190}
    191
    192#endif
    193
     194/* Classify a packet. The procedure is pretty complicated, but
     195 * it allows us to combine link sharing and priority scheduling
     196 * transparently.
     197 *
     198 * Namely, you can put link-sharing rules (e.g. route based) at the root
     199 * of CBQ, so that they resolve to split nodes. Then packets are
     200 * classified by logical priority, or a more specific classifier may be
     201 * attached to the split node.
     202 */
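/* For orientation, a classic LARTC-style setup that exercises this path
 * might look as follows (illustrative only, not part of the original
 * source):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq \
 *		bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq \
 *		bandwidth 10Mbit rate 1Mbit allot 1514 \
 *		prio 5 avpkt 1000 bounded
 *
 * A packet whose skb->priority equals 1:1 is matched by Step 1 below;
 * anything else goes through the attached filters and default maps.
 */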
    203
    204static struct cbq_class *
    205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
    206{
    207	struct cbq_sched_data *q = qdisc_priv(sch);
    208	struct cbq_class *head = &q->link;
    209	struct cbq_class **defmap;
    210	struct cbq_class *cl = NULL;
    211	u32 prio = skb->priority;
    212	struct tcf_proto *fl;
    213	struct tcf_result res;
    214
    215	/*
    216	 *  Step 1. If skb->priority points to one of our classes, use it.
    217	 */
    218	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
    219	    (cl = cbq_class_lookup(q, prio)) != NULL)
    220		return cl;
    221
    222	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
    223	for (;;) {
    224		int result = 0;
    225		defmap = head->defaults;
    226
    227		fl = rcu_dereference_bh(head->filter_list);
    228		/*
    229		 * Step 2+n. Apply classifier.
    230		 */
    231		result = tcf_classify(skb, NULL, fl, &res, true);
    232		if (!fl || result < 0)
    233			goto fallback;
    234
    235		cl = (void *)res.class;
    236		if (!cl) {
    237			if (TC_H_MAJ(res.classid))
    238				cl = cbq_class_lookup(q, res.classid);
    239			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
    240				cl = defmap[TC_PRIO_BESTEFFORT];
    241
    242			if (cl == NULL)
    243				goto fallback;
    244		}
    245		if (cl->level >= head->level)
    246			goto fallback;
    247#ifdef CONFIG_NET_CLS_ACT
    248		switch (result) {
    249		case TC_ACT_QUEUED:
    250		case TC_ACT_STOLEN:
    251		case TC_ACT_TRAP:
    252			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
    253			fallthrough;
    254		case TC_ACT_SHOT:
    255			return NULL;
    256		case TC_ACT_RECLASSIFY:
    257			return cbq_reclassify(skb, cl);
    258		}
    259#endif
    260		if (cl->level == 0)
    261			return cl;
    262
     263		/*
     264		 * Step 3+n. If the classifier selected a link-sharing class,
     265		 *	   apply the agency-specific classifier.
     266		 *	   Repeat this procedure until we hit a leaf node.
     267		 */
    268		head = cl;
    269	}
    270
    271fallback:
    272	cl = head;
    273
    274	/*
    275	 * Step 4. No success...
    276	 */
    277	if (TC_H_MAJ(prio) == 0 &&
    278	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
    279	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
    280		return head;
    281
    282	return cl;
    283}
    284
     285/*
     286 * A packet has just been enqueued into an empty class.
     287 * cbq_activate_class adds the class to the tail of the active class
     288 * list of its priority band.
     289 */
    290
    291static inline void cbq_activate_class(struct cbq_class *cl)
    292{
    293	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
    294	int prio = cl->cpriority;
    295	struct cbq_class *cl_tail;
    296
    297	cl_tail = q->active[prio];
    298	q->active[prio] = cl;
    299
    300	if (cl_tail != NULL) {
    301		cl->next_alive = cl_tail->next_alive;
    302		cl_tail->next_alive = cl;
    303	} else {
    304		cl->next_alive = cl;
    305		q->activemask |= (1<<prio);
    306	}
    307}
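/* The active list is a circular, singly linked ring: q->active[prio]
 * points at the tail, and tail->next_alive is the head, so appending a
 * class only rewrites two next_alive pointers, e.g.:
 *
 *	before:  A -> B -> A        (q->active[prio] == B)
 *	after:   A -> B -> C -> A   (q->active[prio] == C)
 */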
    308
     309/*
     310 * Unlink a class from the active chain.
     311 * Note that the same procedure is done directly in cbq_dequeue*
     312 * during the round-robin pass.
     313 */
    314
    315static void cbq_deactivate_class(struct cbq_class *this)
    316{
    317	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
    318	int prio = this->cpriority;
    319	struct cbq_class *cl;
    320	struct cbq_class *cl_prev = q->active[prio];
    321
    322	do {
    323		cl = cl_prev->next_alive;
    324		if (cl == this) {
    325			cl_prev->next_alive = cl->next_alive;
    326			cl->next_alive = NULL;
    327
    328			if (cl == q->active[prio]) {
    329				q->active[prio] = cl_prev;
    330				if (cl == q->active[prio]) {
    331					q->active[prio] = NULL;
    332					q->activemask &= ~(1<<prio);
    333					return;
    334				}
    335			}
    336			return;
    337		}
    338	} while ((cl_prev = cl) != q->active[prio]);
    339}
    340
    341static void
    342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
    343{
    344	int toplevel = q->toplevel;
    345
    346	if (toplevel > cl->level) {
    347		psched_time_t now = psched_get_time();
    348
    349		do {
    350			if (cl->undertime < now) {
    351				q->toplevel = cl->level;
    352				return;
    353			}
    354		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
    355	}
    356}
    357
    358static int
    359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
    360	    struct sk_buff **to_free)
    361{
    362	struct cbq_sched_data *q = qdisc_priv(sch);
    363	int ret;
    364	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
    365
    366#ifdef CONFIG_NET_CLS_ACT
    367	q->rx_class = cl;
    368#endif
    369	if (cl == NULL) {
    370		if (ret & __NET_XMIT_BYPASS)
    371			qdisc_qstats_drop(sch);
    372		__qdisc_drop(skb, to_free);
    373		return ret;
    374	}
    375
    376	ret = qdisc_enqueue(skb, cl->q, to_free);
    377	if (ret == NET_XMIT_SUCCESS) {
    378		sch->q.qlen++;
    379		cbq_mark_toplevel(q, cl);
    380		if (!cl->next_alive)
    381			cbq_activate_class(cl);
    382		return ret;
    383	}
    384
    385	if (net_xmit_drop_count(ret)) {
    386		qdisc_qstats_drop(sch);
    387		cbq_mark_toplevel(q, cl);
    388		cl->qstats.drops++;
    389	}
    390	return ret;
    391}
    392
    393/* Overlimit action: penalize leaf class by adding offtime */
    394static void cbq_overlimit(struct cbq_class *cl)
    395{
    396	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
    397	psched_tdiff_t delay = cl->undertime - q->now;
    398
    399	if (!cl->delayed) {
    400		delay += cl->offtime;
    401
     402		/*
     403		 * The class goes to sleep, so it will have no
     404		 * chance to work off its avgidle. Let's forgive it 8)
     405		 *
     406		 * BTW cbq-2.0 has a bug in this
     407		 * place; apparently they forgot to shift by cl->ewma_log.
     408		 */
    409		if (cl->avgidle < 0)
    410			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
    411		if (cl->avgidle < cl->minidle)
    412			cl->avgidle = cl->minidle;
    413		if (delay <= 0)
    414			delay = 1;
    415		cl->undertime = q->now + delay;
    416
    417		cl->xstats.overactions++;
    418		cl->delayed = 1;
    419	}
    420	if (q->wd_expires == 0 || q->wd_expires > delay)
    421		q->wd_expires = delay;
    422
     423	/* Dirty work! We must schedule wakeups based on the
     424	 * real available rate, rather than the leaf rate,
     425	 * which may be tiny (even zero).
     426	 */
    427	if (q->toplevel == TC_CBQ_MAXLEVEL) {
    428		struct cbq_class *b;
    429		psched_tdiff_t base_delay = q->wd_expires;
    430
    431		for (b = cl->borrow; b; b = b->borrow) {
    432			delay = b->undertime - q->now;
    433			if (delay < base_delay) {
    434				if (delay <= 0)
    435					delay = 1;
    436				base_delay = delay;
    437			}
    438		}
    439
    440		q->wd_expires = base_delay;
    441	}
    442}
    443
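/* Scan one priority band for penalized classes: any class whose penalty
 * has expired is re-activated at its normal priority; the return value
 * is the time until the earliest remaining penalty expires, or 0 if
 * none remain.
 */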
    444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
    445				       psched_time_t now)
    446{
    447	struct cbq_class *cl;
    448	struct cbq_class *cl_prev = q->active[prio];
    449	psched_time_t sched = now;
    450
    451	if (cl_prev == NULL)
    452		return 0;
    453
    454	do {
    455		cl = cl_prev->next_alive;
    456		if (now - cl->penalized > 0) {
    457			cl_prev->next_alive = cl->next_alive;
    458			cl->next_alive = NULL;
    459			cl->cpriority = cl->priority;
    460			cl->delayed = 0;
    461			cbq_activate_class(cl);
    462
    463			if (cl == q->active[prio]) {
    464				q->active[prio] = cl_prev;
    465				if (cl == q->active[prio]) {
    466					q->active[prio] = NULL;
    467					return 0;
    468				}
    469			}
    470
    471			cl = cl_prev->next_alive;
    472		} else if (sched - cl->penalized > 0)
    473			sched = cl->penalized;
    474	} while ((cl_prev = cl) != q->active[prio]);
    475
    476	return sched - now;
    477}
    478
    479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
    480{
    481	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
    482						delay_timer);
    483	struct Qdisc *sch = q->watchdog.qdisc;
    484	psched_time_t now;
    485	psched_tdiff_t delay = 0;
    486	unsigned int pmask;
    487
    488	now = psched_get_time();
    489
    490	pmask = q->pmask;
    491	q->pmask = 0;
    492
    493	while (pmask) {
    494		int prio = ffz(~pmask);
    495		psched_tdiff_t tmp;
    496
    497		pmask &= ~(1<<prio);
    498
    499		tmp = cbq_undelay_prio(q, prio, now);
    500		if (tmp > 0) {
    501			q->pmask |= 1<<prio;
    502			if (tmp < delay || delay == 0)
    503				delay = tmp;
    504		}
    505	}
    506
    507	if (delay) {
    508		ktime_t time;
    509
    510		time = 0;
    511		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
    512		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
    513	}
    514
    515	__netif_schedule(qdisc_root(sch));
    516	return HRTIMER_NORESTART;
    517}
    518
     519/*
     520 * This is a mission-critical procedure.
     521 *
     522 * We "regenerate" the toplevel cutoff if the transmitting class
     523 * has backlog and is not regulated. This is not part of the
     524 * original CBQ description, but looks more reasonable.
     525 * It may still be wrong; the question needs further investigation.
     526 */
    527
    528static inline void
    529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
    530		    struct cbq_class *borrowed)
    531{
    532	if (cl && q->toplevel >= borrowed->level) {
    533		if (cl->q->q.qlen > 1) {
    534			do {
    535				if (borrowed->undertime == PSCHED_PASTPERFECT) {
    536					q->toplevel = borrowed->level;
    537					return;
    538				}
    539			} while ((borrowed = borrowed->borrow) != NULL);
    540		}
    541#if 0
     542	/* This is not necessary now. Uncommenting it
     543	   will save CPU cycles, but decrease fairness.
     544	 */
    545		q->toplevel = TC_CBQ_MAXLEVEL;
    546#endif
    547	}
    548}
    549
    550static void
    551cbq_update(struct cbq_sched_data *q)
    552{
    553	struct cbq_class *this = q->tx_class;
    554	struct cbq_class *cl = this;
    555	int len = q->tx_len;
    556	psched_time_t now;
    557
    558	q->tx_class = NULL;
     559	/* Time integrator. We calculate the EOS (end-of-service) time
     560	 * by adding the expected packet transmission time.
     561	 */
    562	now = q->now + L2T(&q->link, len);
    563
    564	for ( ; cl; cl = cl->share) {
    565		long avgidle = cl->avgidle;
    566		long idle;
    567
    568		_bstats_update(&cl->bstats, len, 1);
    569
    570		/*
    571		 * (now - last) is total time between packet right edges.
    572		 * (last_pktlen/rate) is "virtual" busy time, so that
    573		 *
    574		 *	idle = (now - last) - last_pktlen/rate
    575		 */
    576
    577		idle = now - cl->last;
    578		if ((unsigned long)idle > 128*1024*1024) {
    579			avgidle = cl->maxidle;
    580		} else {
    581			idle -= L2T(cl, len);
    582
    583		/* true_avgidle := (1-W)*true_avgidle + W*idle,
    584		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
    585		 * cl->avgidle == true_avgidle/W,
    586		 * hence:
    587		 */
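		/* Worked instance (illustrative numbers only): with
		 * ewma_log = 5 (W = 1/32), a scaled avgidle of 3200
		 * (true_avgidle = 100) and a new idle sample of 68, the
		 * update below yields 3200 + 68 - 100 = 3168, i.e.
		 * true_avgidle becomes 3168/32 = 99 = (31*100 + 68)/32.
		 */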
    588			avgidle += idle - (avgidle>>cl->ewma_log);
    589		}
    590
    591		if (avgidle <= 0) {
    592			/* Overlimit or at-limit */
    593
    594			if (avgidle < cl->minidle)
    595				avgidle = cl->minidle;
    596
    597			cl->avgidle = avgidle;
    598
     599			/* Calculate the expected time when this class
     600			 * will be allowed to send.
     601			 * That occurs when:
     602			 * (1-W)*true_avgidle + W*delay = 0, i.e.
     603			 * idle = (1/W - 1)*(-true_avgidle)
     604			 * or
     605			 * idle = (1 - W)*(-cl->avgidle);
     606			 */
    607			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
    608
     609			/*
     610			 * That is not all.
     611			 * To maintain the rate allocated to the class,
     612			 * we add to the undertime the virtual clock time
     613			 * needed to complete the transmitted packet.
     614			 * (len/phys_bandwidth has already elapsed by
     615			 * the time cbq_update runs.)
     616			 */
    617
    618			idle -= L2T(&q->link, len);
    619			idle += L2T(cl, len);
    620
    621			cl->undertime = now + idle;
    622		} else {
    623			/* Underlimit */
    624
    625			cl->undertime = PSCHED_PASTPERFECT;
    626			if (avgidle > cl->maxidle)
    627				cl->avgidle = cl->maxidle;
    628			else
    629				cl->avgidle = avgidle;
    630		}
    631		if ((s64)(now - cl->last) > 0)
    632			cl->last = now;
    633	}
    634
    635	cbq_update_toplevel(q, this, q->tx_borrowed);
    636}
    637
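/* Walk up the borrow chain until a class that may send is found: the
 * class itself if it is under limit, otherwise the nearest under-limit
 * ancestor not above the toplevel cutoff; NULL if no donor exists.
 */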
    638static inline struct cbq_class *
    639cbq_under_limit(struct cbq_class *cl)
    640{
    641	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
    642	struct cbq_class *this_cl = cl;
    643
    644	if (cl->tparent == NULL)
    645		return cl;
    646
    647	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
    648		cl->delayed = 0;
    649		return cl;
    650	}
    651
    652	do {
     653		/* This is a very suspicious place. Currently the
     654		 * overlimit action is generated for non-bounded classes
     655		 * only if the link is completely congested.
     656		 * Though this agrees with the ancestor-only paradigm,
     657		 * it looks very stupid. In particular,
     658		 * it means that this chunk of code will either
     659		 * never be called or will strongly amplify
     660		 * burstiness. Dangerous and silly, but
     661		 * no other solution exists.
     662		 */
    663		cl = cl->borrow;
    664		if (!cl) {
    665			this_cl->qstats.overlimits++;
    666			cbq_overlimit(this_cl);
    667			return NULL;
    668		}
    669		if (cl->level > q->toplevel)
    670			return NULL;
    671	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
    672
    673	cl->delayed = 0;
    674	return cl;
    675}
    676
    677static inline struct sk_buff *
    678cbq_dequeue_prio(struct Qdisc *sch, int prio)
    679{
    680	struct cbq_sched_data *q = qdisc_priv(sch);
    681	struct cbq_class *cl_tail, *cl_prev, *cl;
    682	struct sk_buff *skb;
    683	int deficit;
    684
    685	cl_tail = cl_prev = q->active[prio];
    686	cl = cl_prev->next_alive;
    687
    688	do {
    689		deficit = 0;
    690
    691		/* Start round */
    692		do {
    693			struct cbq_class *borrow = cl;
    694
    695			if (cl->q->q.qlen &&
    696			    (borrow = cbq_under_limit(cl)) == NULL)
    697				goto skip_class;
    698
    699			if (cl->deficit <= 0) {
    700				/* Class exhausted its allotment per
    701				 * this round. Switch to the next one.
    702				 */
    703				deficit = 1;
    704				cl->deficit += cl->quantum;
    705				goto next_class;
    706			}
    707
    708			skb = cl->q->dequeue(cl->q);
    709
     710			/* The class did not give us any skb :-(
     711			 * This can occur even if cl->q->q.qlen != 0,
     712			 * e.g. if cl->q == "tbf".
     713			 */
    714			if (skb == NULL)
    715				goto skip_class;
    716
    717			cl->deficit -= qdisc_pkt_len(skb);
    718			q->tx_class = cl;
    719			q->tx_borrowed = borrow;
    720			if (borrow != cl) {
    721#ifndef CBQ_XSTATS_BORROWS_BYTES
    722				borrow->xstats.borrows++;
    723				cl->xstats.borrows++;
    724#else
    725				borrow->xstats.borrows += qdisc_pkt_len(skb);
    726				cl->xstats.borrows += qdisc_pkt_len(skb);
    727#endif
    728			}
    729			q->tx_len = qdisc_pkt_len(skb);
    730
    731			if (cl->deficit <= 0) {
    732				q->active[prio] = cl;
    733				cl = cl->next_alive;
    734				cl->deficit += cl->quantum;
    735			}
    736			return skb;
    737
    738skip_class:
    739			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
    740				/* Class is empty or penalized.
    741				 * Unlink it from active chain.
    742				 */
    743				cl_prev->next_alive = cl->next_alive;
    744				cl->next_alive = NULL;
    745
    746				/* Did cl_tail point to it? */
    747				if (cl == cl_tail) {
    748					/* Repair it! */
    749					cl_tail = cl_prev;
    750
    751					/* Was it the last class in this band? */
    752					if (cl == cl_tail) {
    753						/* Kill the band! */
    754						q->active[prio] = NULL;
    755						q->activemask &= ~(1<<prio);
    756						if (cl->q->q.qlen)
    757							cbq_activate_class(cl);
    758						return NULL;
    759					}
    760
    761					q->active[prio] = cl_tail;
    762				}
    763				if (cl->q->q.qlen)
    764					cbq_activate_class(cl);
    765
    766				cl = cl_prev;
    767			}
    768
    769next_class:
    770			cl_prev = cl;
    771			cl = cl->next_alive;
    772		} while (cl_prev != cl_tail);
    773	} while (deficit);
    774
    775	q->active[prio] = cl_prev;
    776
    777	return NULL;
    778}
    779
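/* Try each backlogged priority band in order: ffz(~mask) yields the
 * lowest set bit, i.e. the highest-priority band with active classes.
 */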
    780static inline struct sk_buff *
    781cbq_dequeue_1(struct Qdisc *sch)
    782{
    783	struct cbq_sched_data *q = qdisc_priv(sch);
    784	struct sk_buff *skb;
    785	unsigned int activemask;
    786
    787	activemask = q->activemask & 0xFF;
    788	while (activemask) {
    789		int prio = ffz(~activemask);
    790		activemask &= ~(1<<prio);
    791		skb = cbq_dequeue_prio(sch, prio);
    792		if (skb)
    793			return skb;
    794	}
    795	return NULL;
    796}
    797
    798static struct sk_buff *
    799cbq_dequeue(struct Qdisc *sch)
    800{
    801	struct sk_buff *skb;
    802	struct cbq_sched_data *q = qdisc_priv(sch);
    803	psched_time_t now;
    804
    805	now = psched_get_time();
    806
    807	if (q->tx_class)
    808		cbq_update(q);
    809
    810	q->now = now;
    811
    812	for (;;) {
    813		q->wd_expires = 0;
    814
    815		skb = cbq_dequeue_1(sch);
    816		if (skb) {
    817			qdisc_bstats_update(sch, skb);
    818			sch->q.qlen--;
    819			return skb;
    820		}
    821
     822		/* All the classes are overlimit.
     823		 *
     824		 * This is possible if:
     825		 *
     826		 * 1. The scheduler is empty.
     827		 * 2. The toplevel cutoff inhibited borrowing.
     828		 * 3. The root class is overlimit.
     829		 *
     830		 * Reset the 2nd and 3rd conditions and retry.
     831		 *
     832		 * Note that NS and cbq-2.0 are buggy here: picking
     833		 * an arbitrary class is appropriate for ancestor-only
     834		 * sharing, but not for the toplevel algorithm.
     835		 *
     836		 * Our version is better but slower, because it requires
     837		 * two passes; that is unavoidable with top-level sharing.
     838		 */
    839
    840		if (q->toplevel == TC_CBQ_MAXLEVEL &&
    841		    q->link.undertime == PSCHED_PASTPERFECT)
    842			break;
    843
    844		q->toplevel = TC_CBQ_MAXLEVEL;
    845		q->link.undertime = PSCHED_PASTPERFECT;
    846	}
    847
     848	/* No packets in the scheduler, or nobody wants to give them to us :-(
     849	 * Sigh... start the watchdog timer in the latter case.
     850	 */
    851
    852	if (sch->q.qlen) {
    853		qdisc_qstats_overlimit(sch);
    854		if (q->wd_expires)
    855			qdisc_watchdog_schedule(&q->watchdog,
    856						now + q->wd_expires);
    857	}
    858	return NULL;
    859}
    860
    861/* CBQ class maintenance routines */
    862
    863static void cbq_adjust_levels(struct cbq_class *this)
    864{
    865	if (this == NULL)
    866		return;
    867
    868	do {
    869		int level = 0;
    870		struct cbq_class *cl;
    871
    872		cl = this->children;
    873		if (cl) {
    874			do {
    875				if (cl->level > level)
    876					level = cl->level;
    877			} while ((cl = cl->sibling) != this->children);
    878		}
    879		this->level = level + 1;
    880	} while ((this = this->tparent) != NULL);
    881}
    882
    883static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
    884{
    885	struct cbq_class *cl;
    886	unsigned int h;
    887
    888	if (q->quanta[prio] == 0)
    889		return;
    890
    891	for (h = 0; h < q->clhash.hashsize; h++) {
    892		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
     893			/* BUGGGG... Beware! This expression suffers from
     894			 * arithmetic overflows!
     895			 */
    896			if (cl->priority == prio) {
    897				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
    898					q->quanta[prio];
    899			}
    900			if (cl->quantum <= 0 ||
    901			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
    902				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
    903					cl->common.classid, cl->quantum);
    904				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
    905			}
    906		}
    907	}
    908}
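/* Worked instance of the normalization above (illustrative numbers
 * only): two classes in one band with weights 1 and 3 and allot 1514
 * give q->quanta[prio] = 4 and q->nclasses[prio] = 2, hence
 * quantum_1 = 1*1514*2/4 = 757 and quantum_2 = 3*1514*2/4 = 2271;
 * the quanta sum to nclasses*allot while preserving the weight ratio.
 */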
    909
    910static void cbq_sync_defmap(struct cbq_class *cl)
    911{
    912	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
    913	struct cbq_class *split = cl->split;
    914	unsigned int h;
    915	int i;
    916
    917	if (split == NULL)
    918		return;
    919
    920	for (i = 0; i <= TC_PRIO_MAX; i++) {
    921		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
    922			split->defaults[i] = NULL;
    923	}
    924
    925	for (i = 0; i <= TC_PRIO_MAX; i++) {
    926		int level = split->level;
    927
    928		if (split->defaults[i])
    929			continue;
    930
    931		for (h = 0; h < q->clhash.hashsize; h++) {
    932			struct cbq_class *c;
    933
    934			hlist_for_each_entry(c, &q->clhash.hash[h],
    935					     common.hnode) {
    936				if (c->split == split && c->level < level &&
    937				    c->defmap & (1<<i)) {
    938					split->defaults[i] = c;
    939					level = c->level;
    940				}
    941			}
    942		}
    943	}
    944}
    945
    946static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
    947{
    948	struct cbq_class *split = NULL;
    949
    950	if (splitid == 0) {
    951		split = cl->split;
    952		if (!split)
    953			return;
    954		splitid = split->common.classid;
    955	}
    956
    957	if (split == NULL || split->common.classid != splitid) {
    958		for (split = cl->tparent; split; split = split->tparent)
    959			if (split->common.classid == splitid)
    960				break;
    961	}
    962
    963	if (split == NULL)
    964		return;
    965
    966	if (cl->split != split) {
    967		cl->defmap = 0;
    968		cbq_sync_defmap(cl);
    969		cl->split = split;
    970		cl->defmap = def & mask;
    971	} else
    972		cl->defmap = (cl->defmap & ~mask) | (def & mask);
    973
    974	cbq_sync_defmap(cl);
    975}
    976
    977static void cbq_unlink_class(struct cbq_class *this)
    978{
    979	struct cbq_class *cl, **clp;
    980	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
    981
    982	qdisc_class_hash_remove(&q->clhash, &this->common);
    983
    984	if (this->tparent) {
    985		clp = &this->sibling;
    986		cl = *clp;
    987		do {
    988			if (cl == this) {
    989				*clp = cl->sibling;
    990				break;
    991			}
    992			clp = &cl->sibling;
    993		} while ((cl = *clp) != this->sibling);
    994
    995		if (this->tparent->children == this) {
    996			this->tparent->children = this->sibling;
    997			if (this->sibling == this)
    998				this->tparent->children = NULL;
    999		}
   1000	} else {
   1001		WARN_ON(this->sibling != this);
   1002	}
   1003}
   1004
   1005static void cbq_link_class(struct cbq_class *this)
   1006{
   1007	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
   1008	struct cbq_class *parent = this->tparent;
   1009
   1010	this->sibling = this;
   1011	qdisc_class_hash_insert(&q->clhash, &this->common);
   1012
   1013	if (parent == NULL)
   1014		return;
   1015
   1016	if (parent->children == NULL) {
   1017		parent->children = this;
   1018	} else {
   1019		this->sibling = parent->children->sibling;
   1020		parent->children->sibling = this;
   1021	}
   1022}
   1023
   1024static void
   1025cbq_reset(struct Qdisc *sch)
   1026{
   1027	struct cbq_sched_data *q = qdisc_priv(sch);
   1028	struct cbq_class *cl;
   1029	int prio;
   1030	unsigned int h;
   1031
   1032	q->activemask = 0;
   1033	q->pmask = 0;
   1034	q->tx_class = NULL;
   1035	q->tx_borrowed = NULL;
   1036	qdisc_watchdog_cancel(&q->watchdog);
   1037	hrtimer_cancel(&q->delay_timer);
   1038	q->toplevel = TC_CBQ_MAXLEVEL;
   1039	q->now = psched_get_time();
   1040
   1041	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
   1042		q->active[prio] = NULL;
   1043
   1044	for (h = 0; h < q->clhash.hashsize; h++) {
   1045		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
   1046			qdisc_reset(cl->q);
   1047
   1048			cl->next_alive = NULL;
   1049			cl->undertime = PSCHED_PASTPERFECT;
   1050			cl->avgidle = cl->maxidle;
   1051			cl->deficit = cl->quantum;
   1052			cl->cpriority = cl->priority;
   1053		}
   1054	}
   1055	sch->q.qlen = 0;
   1056}
   1057
   1058
   1059static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
   1060{
   1061	if (lss->change & TCF_CBQ_LSS_FLAGS) {
   1062		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
   1063		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
   1064	}
   1065	if (lss->change & TCF_CBQ_LSS_EWMA)
   1066		cl->ewma_log = lss->ewma_log;
   1067	if (lss->change & TCF_CBQ_LSS_AVPKT)
   1068		cl->avpkt = lss->avpkt;
   1069	if (lss->change & TCF_CBQ_LSS_MINIDLE)
   1070		cl->minidle = -(long)lss->minidle;
   1071	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
   1072		cl->maxidle = lss->maxidle;
   1073		cl->avgidle = lss->maxidle;
   1074	}
   1075	if (lss->change & TCF_CBQ_LSS_OFFTIME)
   1076		cl->offtime = lss->offtime;
   1077	return 0;
   1078}
   1079
   1080static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
   1081{
   1082	q->nclasses[cl->priority]--;
   1083	q->quanta[cl->priority] -= cl->weight;
   1084	cbq_normalize_quanta(q, cl->priority);
   1085}
   1086
   1087static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
   1088{
   1089	q->nclasses[cl->priority]++;
   1090	q->quanta[cl->priority] += cl->weight;
   1091	cbq_normalize_quanta(q, cl->priority);
   1092}
   1093
   1094static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
   1095{
   1096	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
   1097
   1098	if (wrr->allot)
   1099		cl->allot = wrr->allot;
   1100	if (wrr->weight)
   1101		cl->weight = wrr->weight;
   1102	if (wrr->priority) {
   1103		cl->priority = wrr->priority - 1;
   1104		cl->cpriority = cl->priority;
   1105		if (cl->priority >= cl->priority2)
   1106			cl->priority2 = TC_CBQ_MAXPRIO - 1;
   1107	}
   1108
   1109	cbq_addprio(q, cl);
   1110	return 0;
   1111}
   1112
   1113static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
   1114{
   1115	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
   1116	return 0;
   1117}
   1118
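/* Netlink policy for the CBQ attributes; the fixed .len checks
 * correspond to the tc_cbq_* structures from the UAPI pkt_sched.h
 * header.
 */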
   1119static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
   1120	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
   1121	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
   1122	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
   1123	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
   1124	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
   1125	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
   1126	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
   1127};
   1128
   1129static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
   1130			 struct nlattr *opt,
   1131			 struct netlink_ext_ack *extack)
   1132{
   1133	int err;
   1134
   1135	if (!opt) {
   1136		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
   1137		return -EINVAL;
   1138	}
   1139
   1140	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
   1141					  cbq_policy, extack);
   1142	if (err < 0)
   1143		return err;
   1144
   1145	if (tb[TCA_CBQ_WRROPT]) {
   1146		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
   1147
   1148		if (wrr->priority > TC_CBQ_MAXPRIO) {
   1149			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
   1150			err = -EINVAL;
   1151		}
   1152	}
   1153	return err;
   1154}
   1155
   1156static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
   1157		    struct netlink_ext_ack *extack)
   1158{
   1159	struct cbq_sched_data *q = qdisc_priv(sch);
   1160	struct nlattr *tb[TCA_CBQ_MAX + 1];
   1161	struct tc_ratespec *r;
   1162	int err;
   1163
   1164	qdisc_watchdog_init(&q->watchdog, sch);
   1165	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
   1166	q->delay_timer.function = cbq_undelay;
   1167
   1168	err = cbq_opt_parse(tb, opt, extack);
   1169	if (err < 0)
   1170		return err;
   1171
   1172	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
   1173		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
   1174		return -EINVAL;
   1175	}
   1176
   1177	r = nla_data(tb[TCA_CBQ_RATE]);
   1178
   1179	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
   1180	if (!q->link.R_tab)
   1181		return -EINVAL;
   1182
   1183	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
   1184	if (err)
   1185		goto put_rtab;
   1186
   1187	err = qdisc_class_hash_init(&q->clhash);
   1188	if (err < 0)
   1189		goto put_block;
   1190
   1191	q->link.sibling = &q->link;
   1192	q->link.common.classid = sch->handle;
   1193	q->link.qdisc = sch;
   1194	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
   1195				      sch->handle, NULL);
   1196	if (!q->link.q)
   1197		q->link.q = &noop_qdisc;
   1198	else
   1199		qdisc_hash_add(q->link.q, true);
   1200
   1201	q->link.priority = TC_CBQ_MAXPRIO - 1;
   1202	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
   1203	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
   1204	q->link.allot = psched_mtu(qdisc_dev(sch));
   1205	q->link.quantum = q->link.allot;
   1206	q->link.weight = q->link.R_tab->rate.rate;
   1207
   1208	q->link.ewma_log = TC_CBQ_DEF_EWMA;
   1209	q->link.avpkt = q->link.allot/2;
   1210	q->link.minidle = -0x7FFFFFFF;
   1211
   1212	q->toplevel = TC_CBQ_MAXLEVEL;
   1213	q->now = psched_get_time();
   1214
   1215	cbq_link_class(&q->link);
   1216
   1217	if (tb[TCA_CBQ_LSSOPT])
   1218		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
   1219
   1220	cbq_addprio(q, &q->link);
   1221	return 0;
   1222
   1223put_block:
   1224	tcf_block_put(q->link.block);
   1225
   1226put_rtab:
   1227	qdisc_put_rtab(q->link.R_tab);
   1228	return err;
   1229}
   1230
   1231static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
   1232{
   1233	unsigned char *b = skb_tail_pointer(skb);
   1234
   1235	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
   1236		goto nla_put_failure;
   1237	return skb->len;
   1238
   1239nla_put_failure:
   1240	nlmsg_trim(skb, b);
   1241	return -1;
   1242}
   1243
   1244static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
   1245{
   1246	unsigned char *b = skb_tail_pointer(skb);
   1247	struct tc_cbq_lssopt opt;
   1248
   1249	opt.flags = 0;
   1250	if (cl->borrow == NULL)
   1251		opt.flags |= TCF_CBQ_LSS_BOUNDED;
   1252	if (cl->share == NULL)
   1253		opt.flags |= TCF_CBQ_LSS_ISOLATED;
   1254	opt.ewma_log = cl->ewma_log;
   1255	opt.level = cl->level;
   1256	opt.avpkt = cl->avpkt;
   1257	opt.maxidle = cl->maxidle;
   1258	opt.minidle = (u32)(-cl->minidle);
   1259	opt.offtime = cl->offtime;
   1260	opt.change = ~0;
   1261	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
   1262		goto nla_put_failure;
   1263	return skb->len;
   1264
   1265nla_put_failure:
   1266	nlmsg_trim(skb, b);
   1267	return -1;
   1268}
   1269
   1270static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
   1271{
   1272	unsigned char *b = skb_tail_pointer(skb);
   1273	struct tc_cbq_wrropt opt;
   1274
   1275	memset(&opt, 0, sizeof(opt));
   1276	opt.flags = 0;
   1277	opt.allot = cl->allot;
   1278	opt.priority = cl->priority + 1;
   1279	opt.cpriority = cl->cpriority + 1;
   1280	opt.weight = cl->weight;
   1281	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
   1282		goto nla_put_failure;
   1283	return skb->len;
   1284
   1285nla_put_failure:
   1286	nlmsg_trim(skb, b);
   1287	return -1;
   1288}
   1289
   1290static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
   1291{
   1292	unsigned char *b = skb_tail_pointer(skb);
   1293	struct tc_cbq_fopt opt;
   1294
   1295	if (cl->split || cl->defmap) {
   1296		opt.split = cl->split ? cl->split->common.classid : 0;
   1297		opt.defmap = cl->defmap;
   1298		opt.defchange = ~0;
   1299		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
   1300			goto nla_put_failure;
   1301	}
   1302	return skb->len;
   1303
   1304nla_put_failure:
   1305	nlmsg_trim(skb, b);
   1306	return -1;
   1307}
   1308
   1309static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
   1310{
   1311	if (cbq_dump_lss(skb, cl) < 0 ||
   1312	    cbq_dump_rate(skb, cl) < 0 ||
   1313	    cbq_dump_wrr(skb, cl) < 0 ||
   1314	    cbq_dump_fopt(skb, cl) < 0)
   1315		return -1;
   1316	return 0;
   1317}
   1318
   1319static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
   1320{
   1321	struct cbq_sched_data *q = qdisc_priv(sch);
   1322	struct nlattr *nest;
   1323
   1324	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
   1325	if (nest == NULL)
   1326		goto nla_put_failure;
   1327	if (cbq_dump_attr(skb, &q->link) < 0)
   1328		goto nla_put_failure;
   1329	return nla_nest_end(skb, nest);
   1330
   1331nla_put_failure:
   1332	nla_nest_cancel(skb, nest);
   1333	return -1;
   1334}
   1335
   1336static int
   1337cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
   1338{
   1339	struct cbq_sched_data *q = qdisc_priv(sch);
   1340
   1341	q->link.xstats.avgidle = q->link.avgidle;
   1342	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
   1343}
   1344
   1345static int
   1346cbq_dump_class(struct Qdisc *sch, unsigned long arg,
   1347	       struct sk_buff *skb, struct tcmsg *tcm)
   1348{
   1349	struct cbq_class *cl = (struct cbq_class *)arg;
   1350	struct nlattr *nest;
   1351
   1352	if (cl->tparent)
   1353		tcm->tcm_parent = cl->tparent->common.classid;
   1354	else
   1355		tcm->tcm_parent = TC_H_ROOT;
   1356	tcm->tcm_handle = cl->common.classid;
   1357	tcm->tcm_info = cl->q->handle;
   1358
   1359	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
   1360	if (nest == NULL)
   1361		goto nla_put_failure;
   1362	if (cbq_dump_attr(skb, cl) < 0)
   1363		goto nla_put_failure;
   1364	return nla_nest_end(skb, nest);
   1365
   1366nla_put_failure:
   1367	nla_nest_cancel(skb, nest);
   1368	return -1;
   1369}
   1370
   1371static int
   1372cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
   1373	struct gnet_dump *d)
   1374{
   1375	struct cbq_sched_data *q = qdisc_priv(sch);
   1376	struct cbq_class *cl = (struct cbq_class *)arg;
   1377	__u32 qlen;
   1378
   1379	cl->xstats.avgidle = cl->avgidle;
   1380	cl->xstats.undertime = 0;
   1381	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
   1382
   1383	if (cl->undertime != PSCHED_PASTPERFECT)
   1384		cl->xstats.undertime = cl->undertime - q->now;
   1385
   1386	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
   1387	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
   1388	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
   1389		return -1;
   1390
   1391	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
   1392}
   1393
   1394static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
   1395		     struct Qdisc **old, struct netlink_ext_ack *extack)
   1396{
   1397	struct cbq_class *cl = (struct cbq_class *)arg;
   1398
   1399	if (new == NULL) {
   1400		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
   1401					cl->common.classid, extack);
   1402		if (new == NULL)
   1403			return -ENOBUFS;
   1404	}
   1405
   1406	*old = qdisc_replace(sch, new, &cl->q);
   1407	return 0;
   1408}
   1409
   1410static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
   1411{
   1412	struct cbq_class *cl = (struct cbq_class *)arg;
   1413
   1414	return cl->q;
   1415}
   1416
   1417static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
   1418{
   1419	struct cbq_class *cl = (struct cbq_class *)arg;
   1420
   1421	cbq_deactivate_class(cl);
   1422}
   1423
   1424static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
   1425{
   1426	struct cbq_sched_data *q = qdisc_priv(sch);
   1427
   1428	return (unsigned long)cbq_class_lookup(q, classid);
   1429}
   1430
   1431static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
   1432{
   1433	struct cbq_sched_data *q = qdisc_priv(sch);
   1434
   1435	WARN_ON(cl->filters);
   1436
   1437	tcf_block_put(cl->block);
   1438	qdisc_put(cl->q);
   1439	qdisc_put_rtab(cl->R_tab);
   1440	gen_kill_estimator(&cl->rate_est);
   1441	if (cl != &q->link)
   1442		kfree(cl);
   1443}
   1444
   1445static void cbq_destroy(struct Qdisc *sch)
   1446{
   1447	struct cbq_sched_data *q = qdisc_priv(sch);
   1448	struct hlist_node *next;
   1449	struct cbq_class *cl;
   1450	unsigned int h;
   1451
   1452#ifdef CONFIG_NET_CLS_ACT
   1453	q->rx_class = NULL;
   1454#endif
    1455	/*
    1456	 * Filters must be destroyed first because we don't destroy the
    1457	 * classes from root to leaves, which means that filters could still
    1458	 * be bound to classes that have already been destroyed. --TGR '04
    1459	 */
   1460	for (h = 0; h < q->clhash.hashsize; h++) {
   1461		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
   1462			tcf_block_put(cl->block);
   1463			cl->block = NULL;
   1464		}
   1465	}
   1466	for (h = 0; h < q->clhash.hashsize; h++) {
   1467		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
   1468					  common.hnode)
   1469			cbq_destroy_class(sch, cl);
   1470	}
   1471	qdisc_class_hash_destroy(&q->clhash);
   1472}
   1473
   1474static int
   1475cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
   1476		 unsigned long *arg, struct netlink_ext_ack *extack)
   1477{
   1478	int err;
   1479	struct cbq_sched_data *q = qdisc_priv(sch);
   1480	struct cbq_class *cl = (struct cbq_class *)*arg;
   1481	struct nlattr *opt = tca[TCA_OPTIONS];
   1482	struct nlattr *tb[TCA_CBQ_MAX + 1];
   1483	struct cbq_class *parent;
   1484	struct qdisc_rate_table *rtab = NULL;
   1485
   1486	err = cbq_opt_parse(tb, opt, extack);
   1487	if (err < 0)
   1488		return err;
   1489
   1490	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
   1491		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
   1492		return -EOPNOTSUPP;
   1493	}
   1494
   1495	if (cl) {
   1496		/* Check parent */
   1497		if (parentid) {
   1498			if (cl->tparent &&
   1499			    cl->tparent->common.classid != parentid) {
   1500				NL_SET_ERR_MSG(extack, "Invalid parent id");
   1501				return -EINVAL;
   1502			}
   1503			if (!cl->tparent && parentid != TC_H_ROOT) {
   1504				NL_SET_ERR_MSG(extack, "Parent must be root");
   1505				return -EINVAL;
   1506			}
   1507		}
   1508
   1509		if (tb[TCA_CBQ_RATE]) {
   1510			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
   1511					      tb[TCA_CBQ_RTAB], extack);
   1512			if (rtab == NULL)
   1513				return -EINVAL;
   1514		}
   1515
   1516		if (tca[TCA_RATE]) {
   1517			err = gen_replace_estimator(&cl->bstats, NULL,
   1518						    &cl->rate_est,
   1519						    NULL,
   1520						    true,
   1521						    tca[TCA_RATE]);
   1522			if (err) {
   1523				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
   1524				qdisc_put_rtab(rtab);
   1525				return err;
   1526			}
   1527		}
   1528
   1529		/* Change class parameters */
   1530		sch_tree_lock(sch);
   1531
   1532		if (cl->next_alive != NULL)
   1533			cbq_deactivate_class(cl);
   1534
   1535		if (rtab) {
   1536			qdisc_put_rtab(cl->R_tab);
   1537			cl->R_tab = rtab;
   1538		}
   1539
   1540		if (tb[TCA_CBQ_LSSOPT])
   1541			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
   1542
   1543		if (tb[TCA_CBQ_WRROPT]) {
   1544			cbq_rmprio(q, cl);
   1545			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
   1546		}
   1547
   1548		if (tb[TCA_CBQ_FOPT])
   1549			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
   1550
   1551		if (cl->q->q.qlen)
   1552			cbq_activate_class(cl);
   1553
   1554		sch_tree_unlock(sch);
   1555
   1556		return 0;
   1557	}
   1558
   1559	if (parentid == TC_H_ROOT)
   1560		return -EINVAL;
   1561
   1562	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
    1563		NL_SET_ERR_MSG(extack, "All of the following attributes MUST be specified: WRR, rate and link sharing");
   1564		return -EINVAL;
   1565	}
   1566
   1567	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
   1568			      extack);
   1569	if (rtab == NULL)
   1570		return -EINVAL;
   1571
   1572	if (classid) {
   1573		err = -EINVAL;
   1574		if (TC_H_MAJ(classid ^ sch->handle) ||
   1575		    cbq_class_lookup(q, classid)) {
    1576			NL_SET_ERR_MSG(extack, "Specified classid is invalid or already in use");
   1577			goto failure;
   1578		}
   1579	} else {
   1580		int i;
   1581		classid = TC_H_MAKE(sch->handle, 0x8000);
   1582
   1583		for (i = 0; i < 0x8000; i++) {
   1584			if (++q->hgenerator >= 0x8000)
   1585				q->hgenerator = 1;
   1586			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
   1587				break;
   1588		}
   1589		err = -ENOSR;
   1590		if (i >= 0x8000) {
   1591			NL_SET_ERR_MSG(extack, "Unable to generate classid");
   1592			goto failure;
   1593		}
   1594		classid = classid|q->hgenerator;
   1595	}
   1596
   1597	parent = &q->link;
   1598	if (parentid) {
   1599		parent = cbq_class_lookup(q, parentid);
   1600		err = -EINVAL;
   1601		if (!parent) {
   1602			NL_SET_ERR_MSG(extack, "Failed to find parentid");
   1603			goto failure;
   1604		}
   1605	}
   1606
   1607	err = -ENOBUFS;
   1608	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
   1609	if (cl == NULL)
   1610		goto failure;
   1611
   1612	gnet_stats_basic_sync_init(&cl->bstats);
   1613	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
   1614	if (err) {
   1615		kfree(cl);
   1616		goto failure;
   1617	}
   1618
   1619	if (tca[TCA_RATE]) {
   1620		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
   1621					NULL, true, tca[TCA_RATE]);
   1622		if (err) {
   1623			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
   1624			tcf_block_put(cl->block);
   1625			kfree(cl);
   1626			goto failure;
   1627		}
   1628	}
   1629
   1630	cl->R_tab = rtab;
   1631	rtab = NULL;
   1632	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
   1633				  NULL);
   1634	if (!cl->q)
   1635		cl->q = &noop_qdisc;
   1636	else
   1637		qdisc_hash_add(cl->q, true);
   1638
   1639	cl->common.classid = classid;
   1640	cl->tparent = parent;
   1641	cl->qdisc = sch;
   1642	cl->allot = parent->allot;
   1643	cl->quantum = cl->allot;
   1644	cl->weight = cl->R_tab->rate.rate;
   1645
   1646	sch_tree_lock(sch);
   1647	cbq_link_class(cl);
   1648	cl->borrow = cl->tparent;
   1649	if (cl->tparent != &q->link)
   1650		cl->share = cl->tparent;
   1651	cbq_adjust_levels(parent);
   1652	cl->minidle = -0x7FFFFFFF;
   1653	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
   1654	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
   1655	if (cl->ewma_log == 0)
   1656		cl->ewma_log = q->link.ewma_log;
   1657	if (cl->maxidle == 0)
   1658		cl->maxidle = q->link.maxidle;
   1659	if (cl->avpkt == 0)
   1660		cl->avpkt = q->link.avpkt;
   1661	if (tb[TCA_CBQ_FOPT])
   1662		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
   1663	sch_tree_unlock(sch);
   1664
   1665	qdisc_class_hash_grow(sch, &q->clhash);
   1666
   1667	*arg = (unsigned long)cl;
   1668	return 0;
   1669
   1670failure:
   1671	qdisc_put_rtab(rtab);
   1672	return err;
   1673}
   1674
   1675static int cbq_delete(struct Qdisc *sch, unsigned long arg,
   1676		      struct netlink_ext_ack *extack)
   1677{
   1678	struct cbq_sched_data *q = qdisc_priv(sch);
   1679	struct cbq_class *cl = (struct cbq_class *)arg;
   1680
   1681	if (cl->filters || cl->children || cl == &q->link)
   1682		return -EBUSY;
   1683
   1684	sch_tree_lock(sch);
   1685
   1686	qdisc_purge_queue(cl->q);
   1687
   1688	if (cl->next_alive)
   1689		cbq_deactivate_class(cl);
   1690
   1691	if (q->tx_borrowed == cl)
   1692		q->tx_borrowed = q->tx_class;
   1693	if (q->tx_class == cl) {
   1694		q->tx_class = NULL;
   1695		q->tx_borrowed = NULL;
   1696	}
   1697#ifdef CONFIG_NET_CLS_ACT
   1698	if (q->rx_class == cl)
   1699		q->rx_class = NULL;
   1700#endif
   1701
   1702	cbq_unlink_class(cl);
   1703	cbq_adjust_levels(cl->tparent);
   1704	cl->defmap = 0;
   1705	cbq_sync_defmap(cl);
   1706
   1707	cbq_rmprio(q, cl);
   1708	sch_tree_unlock(sch);
   1709
   1710	cbq_destroy_class(sch, cl);
   1711	return 0;
   1712}
   1713
   1714static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
   1715				       struct netlink_ext_ack *extack)
   1716{
   1717	struct cbq_sched_data *q = qdisc_priv(sch);
   1718	struct cbq_class *cl = (struct cbq_class *)arg;
   1719
   1720	if (cl == NULL)
   1721		cl = &q->link;
   1722
   1723	return cl->block;
   1724}
   1725
   1726static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
   1727				     u32 classid)
   1728{
   1729	struct cbq_sched_data *q = qdisc_priv(sch);
   1730	struct cbq_class *p = (struct cbq_class *)parent;
   1731	struct cbq_class *cl = cbq_class_lookup(q, classid);
   1732
   1733	if (cl) {
   1734		if (p && p->level <= cl->level)
   1735			return 0;
   1736		cl->filters++;
   1737		return (unsigned long)cl;
   1738	}
   1739	return 0;
   1740}
   1741
   1742static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
   1743{
   1744	struct cbq_class *cl = (struct cbq_class *)arg;
   1745
   1746	cl->filters--;
   1747}
   1748
   1749static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
   1750{
   1751	struct cbq_sched_data *q = qdisc_priv(sch);
   1752	struct cbq_class *cl;
   1753	unsigned int h;
   1754
   1755	if (arg->stop)
   1756		return;
   1757
   1758	for (h = 0; h < q->clhash.hashsize; h++) {
   1759		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
   1760			if (arg->count < arg->skip) {
   1761				arg->count++;
   1762				continue;
   1763			}
   1764			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
   1765				arg->stop = 1;
   1766				return;
   1767			}
   1768			arg->count++;
   1769		}
   1770	}
   1771}
   1772
   1773static const struct Qdisc_class_ops cbq_class_ops = {
   1774	.graft		=	cbq_graft,
   1775	.leaf		=	cbq_leaf,
   1776	.qlen_notify	=	cbq_qlen_notify,
   1777	.find		=	cbq_find,
   1778	.change		=	cbq_change_class,
   1779	.delete		=	cbq_delete,
   1780	.walk		=	cbq_walk,
   1781	.tcf_block	=	cbq_tcf_block,
   1782	.bind_tcf	=	cbq_bind_filter,
   1783	.unbind_tcf	=	cbq_unbind_filter,
   1784	.dump		=	cbq_dump_class,
   1785	.dump_stats	=	cbq_dump_class_stats,
   1786};
   1787
   1788static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
   1789	.next		=	NULL,
   1790	.cl_ops		=	&cbq_class_ops,
   1791	.id		=	"cbq",
   1792	.priv_size	=	sizeof(struct cbq_sched_data),
   1793	.enqueue	=	cbq_enqueue,
   1794	.dequeue	=	cbq_dequeue,
   1795	.peek		=	qdisc_peek_dequeued,
   1796	.init		=	cbq_init,
   1797	.reset		=	cbq_reset,
   1798	.destroy	=	cbq_destroy,
   1799	.change		=	NULL,
   1800	.dump		=	cbq_dump,
   1801	.dump_stats	=	cbq_dump_stats,
   1802	.owner		=	THIS_MODULE,
   1803};
   1804
   1805static int __init cbq_module_init(void)
   1806{
   1807	return register_qdisc(&cbq_qdisc_ops);
   1808}
   1809static void __exit cbq_module_exit(void)
   1810{
   1811	unregister_qdisc(&cbq_qdisc_ops);
   1812}
   1813module_init(cbq_module_init)
   1814module_exit(cbq_module_exit)
   1815MODULE_LICENSE("GPL");