cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_qfq.c (42441B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
      4 *
      5 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
      6 * Copyright (c) 2012 Paolo Valente.
      7 */
      8
      9#include <linux/module.h>
     10#include <linux/init.h>
     11#include <linux/bitops.h>
     12#include <linux/errno.h>
     13#include <linux/netdevice.h>
     14#include <linux/pkt_sched.h>
     15#include <net/sch_generic.h>
     16#include <net/pkt_sched.h>
     17#include <net/pkt_cls.h>
     18
     19
     20/*  Quick Fair Queueing Plus
     21    ========================
     22
     23    Sources:
     24
     25    [1] Paolo Valente,
     26    "Reducing the Execution Time of Fair-Queueing Schedulers."
     27    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
     28
     29    Sources for QFQ:
     30
     31    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
     32    Packet Scheduling with Tight Bandwidth Distribution Guarantees."
     33
     34    See also:
     35    http://retis.sssup.it/~fabio/linux/qfq/
     36 */
     37
     38/*
     39
     40  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
     41  classes. Each aggregate is timestamped with a virtual start time S
     42  and a virtual finish time F, and scheduled according to its
     43  timestamps. S and F are computed as a function of a system virtual
     44  time function V. The classes within each aggregate are instead
     45  scheduled with DRR.
     46
      47  To speed up operations, QFQ+ also divides aggregates into a limited
      48  number of groups. Which group an aggregate belongs to depends on the
      49  ratio between the maximum packet length and the weight of the
      50  aggregate. Groups have their own S and F. In the end, QFQ+
     51  schedules groups, then aggregates within groups, then classes within
     52  aggregates. See [1] and [2] for a full description.
     53
     54  Virtual time computations.
     55
      56  S, F and V are all computed in fixed point arithmetic with
      57  FRAC_BITS fractional bits.
     58
      59  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
      60  one bit per index.
     61  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
     62
     63  The layout of the bits is as below:
     64
     65                   [ MTU_SHIFT ][      FRAC_BITS    ]
     66                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
     67				 ^.__grp->index = 0
     68				 *.__grp->slot_shift
     69
     70  where MIN_SLOT_SHIFT is derived by difference from the others.
     71
      72  The max group index corresponds to Lmax/w_min, where
      73  Lmax=1<<MTU_SHIFT, w_min = 1.
     74  From this, and knowing how many groups (MAX_INDEX) we want,
     75  we can derive the shift corresponding to each group.
     76
     77  Because we often need to compute
     78	F = S + len/w_i  and V = V + len/wsum
      79  instead of storing w_i we store the value
      80	inv_w = (1<<FRAC_BITS)/w_i
      81  so we can do F = S + len * inv_w. Likewise we keep
      82  iwsum = ONE_FP/wsum, so that V = V + len * iwsum, which makes it
      83  easy to move between a static and an adaptive weight sum.
     84
     85  The per-scheduler-instance data contain all the data structures
     86  for the scheduler: bitmaps and bucket lists.
     87
     88 */
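
/* For illustration of the fixed-point arithmetic above (a sketch using the
 * constants defined below, FRAC_BITS = 30 so ONE_FP = 2^30): a class with
 * weight w_i = 4 stores inv_w = ONE_FP/4 = 2^28, and serving a len = 1000
 * byte packet advances its finish time by
 *
 *	F - S = len * inv_w = 1000 * 2^28 = 250 * ONE_FP,
 *
 * i.e. 250 units of virtual time, exactly len/w_i = 1000/4 in fixed point.
 * V advances by len * iwsum = len * ONE_FP/wsum in the same way, so all
 * timestamps stay comparable.
 */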
     89
     90/*
      91 * Maximum number of consecutive slots occupied by backlogged aggregates
      92 * inside a group.
     93 */
     94#define QFQ_MAX_SLOTS	32
     95
     96/*
     97 * Shifts used for aggregate<->group mapping.  We allow class weights that are
     98 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
     99 * group with the smallest index that can support the L_i / r_i configured
    100 * for the classes in the aggregate.
    101 *
    102 * grp->index is the index of the group; and grp->slot_shift
    103 * is the shift for the corresponding (scaled) sigma_i.
    104 */
    105#define QFQ_MAX_INDEX		24
    106#define QFQ_MAX_WSHIFT		10
    107
    108#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
    109#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
    110
    111#define FRAC_BITS		30	/* fixed point arithmetic */
    112#define ONE_FP			(1UL << FRAC_BITS)
    113
    114#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
    115#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
    116
    117#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
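
/* For illustration, how these constants combine (assuming the device queue
 * is long enough that qfq_init_qdisc() below allows the full
 * QFQ_MAX_AGG_CLASSES = 8 classes per aggregate, i.e. max_cl_shift = 3):
 *
 *	min_slot_shift = FRAC_BITS + (QFQ_MTU_SHIFT + 3) - QFQ_MAX_INDEX
 *		       = 30 + 19 - 24 = 25,
 *
 * so group i serves slots of 2^(25+i) virtual-time units. The worst case,
 * budgetmax = 8 * 2^QFQ_MTU_SHIFT = 2^19 bytes at weight 1 (inv_w = 2^30),
 * gives slot_size = 2^49, which maps exactly to the last group,
 * QFQ_MAX_INDEX = 24, with slot_shift = 25 + 24 = 49.
 */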
    118
     119/*
     120 * Possible group states: E/I = eligible/ineligible, R/B = ready/blocked.
     121 * These values are used as indexes for the bitmaps array of struct qfq_sched.
     122 */
    123enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
    124
    125struct qfq_group;
    126
    127struct qfq_aggregate;
    128
    129struct qfq_class {
    130	struct Qdisc_class_common common;
    131
    132	unsigned int filter_cnt;
    133
    134	struct gnet_stats_basic_sync bstats;
    135	struct gnet_stats_queue qstats;
    136	struct net_rate_estimator __rcu *rate_est;
    137	struct Qdisc *qdisc;
    138	struct list_head alist;		/* Link for active-classes list. */
    139	struct qfq_aggregate *agg;	/* Parent aggregate. */
    140	int deficit;			/* DRR deficit counter. */
    141};
    142
    143struct qfq_aggregate {
    144	struct hlist_node next;	/* Link for the slot list. */
    145	u64 S, F;		/* flow timestamps (exact) */
    146
    147	/* group we belong to. In principle we would need the index,
    148	 * which is log_2(lmax/weight), but we never reference it
    149	 * directly, only the group.
    150	 */
    151	struct qfq_group *grp;
    152
    153	/* these are copied from the flowset. */
    154	u32	class_weight; /* Weight of each class in this aggregate. */
    155	/* Max pkt size for the classes in this aggregate, DRR quantum. */
    156	int	lmax;
    157
    158	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
    159	u32	budgetmax;  /* Max budget for this aggregate. */
    160	u32	initial_budget, budget;     /* Initial and current budget. */
    161
    162	int		  num_classes;	/* Number of classes in this aggr. */
    163	struct list_head  active;	/* DRR queue of active classes. */
    164
    165	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
    166};
    167
    168struct qfq_group {
    169	u64 S, F;			/* group timestamps (approx). */
    170	unsigned int slot_shift;	/* Slot shift. */
    171	unsigned int index;		/* Group index. */
    172	unsigned int front;		/* Index of the front slot. */
    173	unsigned long full_slots;	/* non-empty slots */
    174
    175	/* Array of RR lists of active aggregates. */
    176	struct hlist_head slots[QFQ_MAX_SLOTS];
    177};
    178
    179struct qfq_sched {
    180	struct tcf_proto __rcu *filter_list;
    181	struct tcf_block	*block;
    182	struct Qdisc_class_hash clhash;
    183
    184	u64			oldV, V;	/* Precise virtual times. */
    185	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */
    186	u32			wsum;		/* weight sum */
    187	u32			iwsum;		/* inverse weight sum */
    188
    189	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
    190	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
    191	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
    192
    193	u32 max_agg_classes;		/* Max number of classes per aggr. */
    194	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
    195};
    196
    197/*
     198 * Possible reasons why the timestamps of an aggregate are updated:
     199 * enqueue: the aggregate switches from idle to active and must be scheduled
    200 *	    for service
    201 * requeue: the aggregate finishes its budget, so it stops being served and
    202 *	    must be rescheduled for service
    203 */
    204enum update_reason {enqueue, requeue};
    205
    206static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
    207{
    208	struct qfq_sched *q = qdisc_priv(sch);
    209	struct Qdisc_class_common *clc;
    210
    211	clc = qdisc_class_find(&q->clhash, classid);
    212	if (clc == NULL)
    213		return NULL;
    214	return container_of(clc, struct qfq_class, common);
    215}
    216
    217static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
    218	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
    219	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
    220};
    221
    222/*
    223 * Calculate a flow index, given its weight and maximum packet length.
    224 * index = log_2(maxlen/weight) but we need to apply the scaling.
    225 * This is used only once at flow creation.
    226 */
    227static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
    228{
    229	u64 slot_size = (u64)maxlen * inv_w;
    230	unsigned long size_map;
    231	int index = 0;
    232
    233	size_map = slot_size >> min_slot_shift;
    234	if (!size_map)
    235		goto out;
    236
    237	index = __fls(size_map) + 1;	/* basically a log_2 */
    238	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
    239
    240	if (index < 0)
    241		index = 0;
    242out:
    243	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
    244		 (unsigned long) ONE_FP/inv_w, maxlen, index);
    245
    246	return index;
    247}
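
/* For illustration, a worked call of qfq_calc_index() (assuming
 * min_slot_shift = 25, as derived in qfq_init_qdisc() when 8 classes per
 * aggregate are allowed): a single-class aggregate with weight 1 and
 * lmax = 512 has inv_w = ONE_FP = 2^30 and budgetmax = 512, so
 * slot_size = 512 * 2^30 = 2^39 and size_map = 2^39 >> 25 = 2^14.
 * Then __fls(2^14) + 1 = 15, and because slot_size is an exact power of
 * two the next line subtracts 1, giving index 14 (slot_shift 25 + 14 = 39).
 */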
    248
    249static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
    250static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
    251			     enum update_reason);
    252
    253static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
    254			 u32 lmax, u32 weight)
    255{
    256	INIT_LIST_HEAD(&agg->active);
    257	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
    258
    259	agg->lmax = lmax;
    260	agg->class_weight = weight;
    261}
    262
    263static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
    264					  u32 lmax, u32 weight)
    265{
    266	struct qfq_aggregate *agg;
    267
    268	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
    269		if (agg->lmax == lmax && agg->class_weight == weight)
    270			return agg;
    271
    272	return NULL;
    273}
    274
    275
    276/* Update aggregate as a function of the new number of classes. */
    277static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
    278			   int new_num_classes)
    279{
    280	u32 new_agg_weight;
    281
    282	if (new_num_classes == q->max_agg_classes)
    283		hlist_del_init(&agg->nonfull_next);
    284
    285	if (agg->num_classes > new_num_classes &&
    286	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
    287		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
    288
    289	/* The next assignment may let
    290	 * agg->initial_budget > agg->budgetmax
    291	 * hold, we will take it into account in charge_actual_service().
    292	 */
    293	agg->budgetmax = new_num_classes * agg->lmax;
    294	new_agg_weight = agg->class_weight * new_num_classes;
    295	agg->inv_w = ONE_FP/new_agg_weight;
    296
    297	if (agg->grp == NULL) {
    298		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
    299				       q->min_slot_shift);
    300		agg->grp = &q->groups[i];
    301	}
    302
    303	q->wsum +=
    304		(int) agg->class_weight * (new_num_classes - agg->num_classes);
    305	q->iwsum = ONE_FP / q->wsum;
    306
    307	agg->num_classes = new_num_classes;
    308}
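
/* For illustration: if an aggregate with class_weight = 2 grows from 3 to
 * 4 classes, budgetmax becomes 4 * lmax, the aggregate weight becomes
 * 2 * 4 = 8 (so inv_w = ONE_FP/8), and wsum grows by 2 * (4 - 3) = 2.
 * Note the group is chosen only once, on the first call, from inv_w and
 * budgetmax; weight/lmax changes go through qfq_change_agg() instead.
 */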
    309
    310/* Add class to aggregate. */
    311static void qfq_add_to_agg(struct qfq_sched *q,
    312			   struct qfq_aggregate *agg,
    313			   struct qfq_class *cl)
    314{
    315	cl->agg = agg;
    316
    317	qfq_update_agg(q, agg, agg->num_classes+1);
    318	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
    319		list_add_tail(&cl->alist, &agg->active);
    320		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
    321		    cl && q->in_serv_agg != agg) /* agg was inactive */
    322			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
    323	}
    324}
    325
    326static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
    327
    328static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
    329{
    330	hlist_del_init(&agg->nonfull_next);
    331	q->wsum -= agg->class_weight;
    332	if (q->wsum != 0)
    333		q->iwsum = ONE_FP / q->wsum;
    334
    335	if (q->in_serv_agg == agg)
    336		q->in_serv_agg = qfq_choose_next_agg(q);
    337	kfree(agg);
    338}
    339
    340/* Deschedule class from within its parent aggregate. */
    341static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
    342{
    343	struct qfq_aggregate *agg = cl->agg;
    344
    345
    346	list_del(&cl->alist); /* remove from RR queue of the aggregate */
    347	if (list_empty(&agg->active)) /* agg is now inactive */
    348		qfq_deactivate_agg(q, agg);
    349}
    350
    351/* Remove class from its parent aggregate. */
    352static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
    353{
    354	struct qfq_aggregate *agg = cl->agg;
    355
    356	cl->agg = NULL;
    357	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
    358		qfq_destroy_agg(q, agg);
    359		return;
    360	}
    361	qfq_update_agg(q, agg, agg->num_classes-1);
    362}
    363
    364/* Deschedule class and remove it from its parent aggregate. */
    365static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
    366{
    367	if (cl->qdisc->q.qlen > 0) /* class is active */
    368		qfq_deactivate_class(q, cl);
    369
    370	qfq_rm_from_agg(q, cl);
    371}
    372
    373/* Move class to a new aggregate, matching the new class weight and/or lmax */
    374static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
    375			   u32 lmax)
    376{
    377	struct qfq_sched *q = qdisc_priv(sch);
    378	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
    379
    380	if (new_agg == NULL) { /* create new aggregate */
    381		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
    382		if (new_agg == NULL)
    383			return -ENOBUFS;
    384		qfq_init_agg(q, new_agg, lmax, weight);
    385	}
    386	qfq_deact_rm_from_agg(q, cl);
    387	qfq_add_to_agg(q, new_agg, cl);
    388
    389	return 0;
    390}
    391
    392static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
    393			    struct nlattr **tca, unsigned long *arg,
    394			    struct netlink_ext_ack *extack)
    395{
    396	struct qfq_sched *q = qdisc_priv(sch);
    397	struct qfq_class *cl = (struct qfq_class *)*arg;
    398	bool existing = false;
    399	struct nlattr *tb[TCA_QFQ_MAX + 1];
    400	struct qfq_aggregate *new_agg = NULL;
    401	u32 weight, lmax, inv_w;
    402	int err;
    403	int delta_w;
    404
    405	if (tca[TCA_OPTIONS] == NULL) {
    406		pr_notice("qfq: no options\n");
    407		return -EINVAL;
    408	}
    409
    410	err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS],
    411					  qfq_policy, NULL);
    412	if (err < 0)
    413		return err;
    414
    415	if (tb[TCA_QFQ_WEIGHT]) {
    416		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
    417		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
    418			pr_notice("qfq: invalid weight %u\n", weight);
    419			return -EINVAL;
    420		}
    421	} else
    422		weight = 1;
    423
    424	if (tb[TCA_QFQ_LMAX]) {
    425		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
    426		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
    427			pr_notice("qfq: invalid max length %u\n", lmax);
    428			return -EINVAL;
    429		}
    430	} else
    431		lmax = psched_mtu(qdisc_dev(sch));
    432
    433	inv_w = ONE_FP / weight;
    434	weight = ONE_FP / inv_w;
    435
    436	if (cl != NULL &&
    437	    lmax == cl->agg->lmax &&
    438	    weight == cl->agg->class_weight)
    439		return 0; /* nothing to change */
    440
    441	delta_w = weight - (cl ? cl->agg->class_weight : 0);
    442
    443	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
    444		pr_notice("qfq: total weight out of range (%d + %u)\n",
    445			  delta_w, q->wsum);
    446		return -EINVAL;
    447	}
    448
    449	if (cl != NULL) { /* modify existing class */
    450		if (tca[TCA_RATE]) {
    451			err = gen_replace_estimator(&cl->bstats, NULL,
    452						    &cl->rate_est,
    453						    NULL,
    454						    true,
    455						    tca[TCA_RATE]);
    456			if (err)
    457				return err;
    458		}
    459		existing = true;
    460		goto set_change_agg;
    461	}
    462
    463	/* create and init new class */
    464	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
    465	if (cl == NULL)
    466		return -ENOBUFS;
    467
    468	gnet_stats_basic_sync_init(&cl->bstats);
    469	cl->common.classid = classid;
    470	cl->deficit = lmax;
    471
    472	cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
    473				      classid, NULL);
    474	if (cl->qdisc == NULL)
    475		cl->qdisc = &noop_qdisc;
    476
    477	if (tca[TCA_RATE]) {
    478		err = gen_new_estimator(&cl->bstats, NULL,
    479					&cl->rate_est,
    480					NULL,
    481					true,
    482					tca[TCA_RATE]);
    483		if (err)
    484			goto destroy_class;
    485	}
    486
    487	if (cl->qdisc != &noop_qdisc)
    488		qdisc_hash_add(cl->qdisc, true);
    489
    490set_change_agg:
    491	sch_tree_lock(sch);
    492	new_agg = qfq_find_agg(q, lmax, weight);
    493	if (new_agg == NULL) { /* create new aggregate */
    494		sch_tree_unlock(sch);
    495		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
    496		if (new_agg == NULL) {
    497			err = -ENOBUFS;
    498			gen_kill_estimator(&cl->rate_est);
    499			goto destroy_class;
    500		}
    501		sch_tree_lock(sch);
    502		qfq_init_agg(q, new_agg, lmax, weight);
    503	}
    504	if (existing)
    505		qfq_deact_rm_from_agg(q, cl);
    506	else
    507		qdisc_class_hash_insert(&q->clhash, &cl->common);
    508	qfq_add_to_agg(q, new_agg, cl);
    509	sch_tree_unlock(sch);
    510	qdisc_class_hash_grow(sch, &q->clhash);
    511
    512	*arg = (unsigned long)cl;
    513	return 0;
    514
    515destroy_class:
    516	qdisc_put(cl->qdisc);
    517	kfree(cl);
    518	return err;
    519}
    520
    521static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
    522{
    523	struct qfq_sched *q = qdisc_priv(sch);
    524
    525	qfq_rm_from_agg(q, cl);
    526	gen_kill_estimator(&cl->rate_est);
    527	qdisc_put(cl->qdisc);
    528	kfree(cl);
    529}
    530
    531static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
    532			    struct netlink_ext_ack *extack)
    533{
    534	struct qfq_sched *q = qdisc_priv(sch);
    535	struct qfq_class *cl = (struct qfq_class *)arg;
    536
    537	if (cl->filter_cnt > 0)
    538		return -EBUSY;
    539
    540	sch_tree_lock(sch);
    541
    542	qdisc_purge_queue(cl->qdisc);
    543	qdisc_class_hash_remove(&q->clhash, &cl->common);
    544
    545	sch_tree_unlock(sch);
    546
    547	qfq_destroy_class(sch, cl);
    548	return 0;
    549}
    550
    551static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
    552{
    553	return (unsigned long)qfq_find_class(sch, classid);
    554}
    555
    556static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
    557				       struct netlink_ext_ack *extack)
    558{
    559	struct qfq_sched *q = qdisc_priv(sch);
    560
    561	if (cl)
    562		return NULL;
    563
    564	return q->block;
    565}
    566
    567static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
    568				  u32 classid)
    569{
    570	struct qfq_class *cl = qfq_find_class(sch, classid);
    571
    572	if (cl != NULL)
    573		cl->filter_cnt++;
    574
    575	return (unsigned long)cl;
    576}
    577
    578static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
    579{
    580	struct qfq_class *cl = (struct qfq_class *)arg;
    581
    582	cl->filter_cnt--;
    583}
    584
    585static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
    586			   struct Qdisc *new, struct Qdisc **old,
    587			   struct netlink_ext_ack *extack)
    588{
    589	struct qfq_class *cl = (struct qfq_class *)arg;
    590
    591	if (new == NULL) {
    592		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
    593					cl->common.classid, NULL);
    594		if (new == NULL)
    595			new = &noop_qdisc;
    596	}
    597
    598	*old = qdisc_replace(sch, new, &cl->qdisc);
    599	return 0;
    600}
    601
    602static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
    603{
    604	struct qfq_class *cl = (struct qfq_class *)arg;
    605
    606	return cl->qdisc;
    607}
    608
    609static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
    610			  struct sk_buff *skb, struct tcmsg *tcm)
    611{
    612	struct qfq_class *cl = (struct qfq_class *)arg;
    613	struct nlattr *nest;
    614
    615	tcm->tcm_parent	= TC_H_ROOT;
    616	tcm->tcm_handle	= cl->common.classid;
    617	tcm->tcm_info	= cl->qdisc->handle;
    618
    619	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
    620	if (nest == NULL)
    621		goto nla_put_failure;
    622	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
    623	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
    624		goto nla_put_failure;
    625	return nla_nest_end(skb, nest);
    626
    627nla_put_failure:
    628	nla_nest_cancel(skb, nest);
    629	return -EMSGSIZE;
    630}
    631
    632static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
    633				struct gnet_dump *d)
    634{
    635	struct qfq_class *cl = (struct qfq_class *)arg;
    636	struct tc_qfq_stats xstats;
    637
    638	memset(&xstats, 0, sizeof(xstats));
    639
    640	xstats.weight = cl->agg->class_weight;
    641	xstats.lmax = cl->agg->lmax;
    642
    643	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
    644	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
    645	    qdisc_qstats_copy(d, cl->qdisc) < 0)
    646		return -1;
    647
    648	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
    649}
    650
    651static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
    652{
    653	struct qfq_sched *q = qdisc_priv(sch);
    654	struct qfq_class *cl;
    655	unsigned int i;
    656
    657	if (arg->stop)
    658		return;
    659
    660	for (i = 0; i < q->clhash.hashsize; i++) {
    661		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
    662			if (arg->count < arg->skip) {
    663				arg->count++;
    664				continue;
    665			}
    666			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
    667				arg->stop = 1;
    668				return;
    669			}
    670			arg->count++;
    671		}
    672	}
    673}
    674
    675static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
    676				      int *qerr)
    677{
    678	struct qfq_sched *q = qdisc_priv(sch);
    679	struct qfq_class *cl;
    680	struct tcf_result res;
    681	struct tcf_proto *fl;
    682	int result;
    683
    684	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
    685		pr_debug("qfq_classify: found %d\n", skb->priority);
    686		cl = qfq_find_class(sch, skb->priority);
    687		if (cl != NULL)
    688			return cl;
    689	}
    690
    691	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
    692	fl = rcu_dereference_bh(q->filter_list);
    693	result = tcf_classify(skb, NULL, fl, &res, false);
    694	if (result >= 0) {
    695#ifdef CONFIG_NET_CLS_ACT
    696		switch (result) {
    697		case TC_ACT_QUEUED:
    698		case TC_ACT_STOLEN:
    699		case TC_ACT_TRAP:
    700			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
    701			fallthrough;
    702		case TC_ACT_SHOT:
    703			return NULL;
    704		}
    705#endif
    706		cl = (struct qfq_class *)res.class;
    707		if (cl == NULL)
    708			cl = qfq_find_class(sch, res.classid);
    709		return cl;
    710	}
    711
    712	return NULL;
    713}
    714
    715/* Generic comparison function, handling wraparound. */
    716static inline int qfq_gt(u64 a, u64 b)
    717{
    718	return (s64)(a - b) > 0;
    719}
    720
    721/* Round a precise timestamp to its slotted value. */
    722static inline u64 qfq_round_down(u64 ts, unsigned int shift)
    723{
    724	return ts & ~((1ULL << shift) - 1);
    725}
    726
    727/* return the pointer to the group with lowest index in the bitmap */
    728static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
    729					unsigned long bitmap)
    730{
    731	int index = __ffs(bitmap);
    732	return &q->groups[index];
    733}
    734/* Calculate a mask to mimic what would be ffs_from(). */
    735static inline unsigned long mask_from(unsigned long bitmap, int from)
    736{
    737	return bitmap & ~((1UL << from) - 1);
    738}
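
/* For illustration of the three helpers above:
 *  - qfq_gt() compares timestamps modulo 2^64, so it stays correct across
 *    wraparound: qfq_gt(2, ~0ULL) is true since (s64)(2 - ~0ULL) == 3 > 0;
 *  - qfq_round_down() clears the low bits: qfq_round_down(0x1234, 8)
 *    returns 0x1200;
 *  - mask_from() keeps only bits >= from: mask_from(0x16, 2) == 0x14
 *    (0b10110 -> 0b10100).
 */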
    739
    740/*
    741 * The state computation relies on ER=0, IR=1, EB=2, IB=3
    742 * First compute eligibility comparing grp->S, q->V,
    743 * then check if someone is blocking us and possibly add EB
    744 */
    745static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
    746{
    747	/* if S > V we are not eligible */
    748	unsigned int state = qfq_gt(grp->S, q->V);
    749	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
    750	struct qfq_group *next;
    751
    752	if (mask) {
    753		next = qfq_ffs(q, mask);
    754		if (qfq_gt(grp->F, next->F))
    755			state |= EB;
    756	}
    757
    758	return state;
    759}
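
/* For illustration: the eligibility test qfq_gt(grp->S, q->V) yields
 * 0 (eligible) or 1 (ineligible), and OR-ing in EB turns ER into EB
 * (0 -> 2) and IR into IB (1 -> 3). A group is blocked when some ER
 * group of equal or higher index has a finish time smaller than grp->F.
 */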
    760
    761
    762/*
    763 * In principle
    764 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
    765 *	q->bitmaps[src] &= ~mask;
    766 * but we should make sure that src != dst
    767 */
    768static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
    769				   int src, int dst)
    770{
    771	q->bitmaps[dst] |= q->bitmaps[src] & mask;
    772	q->bitmaps[src] &= ~mask;
    773}
    774
    775static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
    776{
    777	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
    778	struct qfq_group *next;
    779
    780	if (mask) {
    781		next = qfq_ffs(q, mask);
    782		if (!qfq_gt(next->F, old_F))
    783			return;
    784	}
    785
    786	mask = (1UL << index) - 1;
    787	qfq_move_groups(q, mask, EB, ER);
    788	qfq_move_groups(q, mask, IB, IR);
    789}
    790
    791/*
    792 * perhaps
    793 *
    794	old_V ^= q->V;
    795	old_V >>= q->min_slot_shift;
    796	if (old_V) {
    797		...
    798	}
    799 *
    800 */
    801static void qfq_make_eligible(struct qfq_sched *q)
    802{
    803	unsigned long vslot = q->V >> q->min_slot_shift;
    804	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
    805
    806	if (vslot != old_vslot) {
    807		unsigned long mask;
    808		int last_flip_pos = fls(vslot ^ old_vslot);
    809
    810		if (last_flip_pos > 31) /* higher than the number of groups */
    811			mask = ~0UL;    /* make all groups eligible */
    812		else
    813			mask = (1UL << last_flip_pos) - 1;
    814
    815		qfq_move_groups(q, mask, IR, ER);
    816		qfq_move_groups(q, mask, IB, EB);
    817	}
    818}
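
/* For illustration (assuming min_slot_shift = 25): if V crosses a group-0
 * slot boundary, say oldV = 2^25 - 1 and V = 2^25, then old_vslot = 0,
 * vslot = 1, fls(vslot ^ old_vslot) = 1 and mask = (1UL << 1) - 1 = 1,
 * so exactly group 0 becomes eligible (IR -> ER, IB -> EB).
 */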
    819
    820/*
    821 * The index of the slot in which the input aggregate agg is to be
    822 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
    823 * and not a '-1' because the start time of the group may be moved
    824 * backward by one slot after the aggregate has been inserted, and
    825 * this would cause non-empty slots to be right-shifted by one
    826 * position.
    827 *
    828 * QFQ+ fully satisfies this bound to the slot index if the parameters
    829 * of the classes are not changed dynamically, and if QFQ+ never
    830 * happens to postpone the service of agg unjustly, i.e., it never
    831 * happens that the aggregate becomes backlogged and eligible, or just
    832 * eligible, while an aggregate with a higher approximated finish time
    833 * is being served. In particular, in this case QFQ+ guarantees that
    834 * the timestamps of agg are low enough that the slot index is never
    835 * higher than 2. Unfortunately, QFQ+ cannot provide the same
    836 * guarantee if it happens to unjustly postpone the service of agg, or
    837 * if the parameters of some class are changed.
    838 *
    839 * As for the first event, i.e., an out-of-order service, the
    840 * upper bound to the slot index guaranteed by QFQ+ grows to
    841 * 2 +
    842 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
    843 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
    844 *
    845 * The following function deals with this problem by backward-shifting
    846 * the timestamps of agg, if needed, so as to guarantee that the slot
    847 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
    848 * cause the service of other aggregates to be postponed, yet the
    849 * worst-case guarantees of these aggregates are not violated.  In
    850 * fact, in case of no out-of-order service, the timestamps of agg
    851 * would have been even lower than they are after the backward shift,
    852 * because QFQ+ would have guaranteed a maximum value equal to 2 for
    853 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
    854 * service is postponed because of the backward-shift would have
    855 * however waited for the service of agg before being served.
    856 *
    857 * The other event that may cause the slot index to be higher than 2
    858 * for agg is a recent change of the parameters of some class. If the
    859 * weight of a class is increased or the lmax (max_pkt_size) of the
    860 * class is decreased, then a new aggregate with smaller slot size
    861 * than the original parent aggregate of the class may happen to be
    862 * activated. The activation of this aggregate should be properly
    863 * delayed to when the service of the class has finished in the ideal
    864 * system tracked by QFQ+. If the activation of the aggregate is not
    865 * delayed to this reference time instant, then this aggregate may be
    866 * unjustly served before other aggregates waiting for service. This
    867 * may cause the above bound to the slot index to be violated for some
    868 * of these unlucky aggregates.
    869 *
    870 * Instead of delaying the activation of the new aggregate, which is
    871 * quite complex, the above-discussed capping of the slot index is
    872 * used to handle also the consequences of a change of the parameters
    873 * of a class.
    874 */
    875static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
    876			    u64 roundedS)
    877{
    878	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
    879	unsigned int i; /* slot index in the bucket list */
    880
    881	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
    882		u64 deltaS = roundedS - grp->S -
    883			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
    884		agg->S -= deltaS;
    885		agg->F -= deltaS;
    886		slot = QFQ_MAX_SLOTS - 2;
    887	}
    888
    889	i = (grp->front + slot) % QFQ_MAX_SLOTS;
    890
    891	hlist_add_head(&agg->next, &grp->slots[i]);
    892	__set_bit(slot, &grp->full_slots);
    893}
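
/* For illustration of the capping above: with QFQ_MAX_SLOTS = 32, a
 * computed slot of 40 exceeds QFQ_MAX_SLOTS - 2 = 30, so
 * deltaS = (40 - 30) << grp->slot_shift is subtracted from both agg->S and
 * agg->F and the aggregate lands in slot 30. Slot 31 stays free in case
 * grp->S later moves one slot backward (see qfq_slot_rotate()).
 */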
    894
    895/* Maybe introduce hlist_first_entry?? */
    896static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
    897{
    898	return hlist_entry(grp->slots[grp->front].first,
    899			   struct qfq_aggregate, next);
    900}
    901
    902/*
    903 * remove the entry from the slot
    904 */
    905static void qfq_front_slot_remove(struct qfq_group *grp)
    906{
    907	struct qfq_aggregate *agg = qfq_slot_head(grp);
    908
    909	BUG_ON(!agg);
    910	hlist_del(&agg->next);
    911	if (hlist_empty(&grp->slots[grp->front]))
    912		__clear_bit(0, &grp->full_slots);
    913}
    914
    915/*
    916 * Returns the first aggregate in the first non-empty bucket of the
    917 * group. As a side effect, adjusts the bucket list so the first
    918 * non-empty bucket is at position 0 in full_slots.
    919 */
    920static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
    921{
    922	unsigned int i;
    923
    924	pr_debug("qfq slot_scan: grp %u full %#lx\n",
    925		 grp->index, grp->full_slots);
    926
    927	if (grp->full_slots == 0)
    928		return NULL;
    929
    930	i = __ffs(grp->full_slots);  /* zero based */
    931	if (i > 0) {
    932		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
    933		grp->full_slots >>= i;
    934	}
    935
    936	return qfq_slot_head(grp);
    937}
    938
    939/*
    940 * adjust the bucket list. When the start time of a group decreases,
    941 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
    942 * move the objects. The mask of occupied slots must be shifted
    943 * because we use ffs() to find the first non-empty slot.
    944 * This covers decreases in the group's start time, but what about
    945 * increases of the start time ?
    946 * Here too we should make sure that i is less than 32
    947 */
    948static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
    949{
    950	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
    951
    952	grp->full_slots <<= i;
    953	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
    954}
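
/* For illustration: if grp->S moves back by two slots, i = 2, the occupied
 * mask is shifted left by 2 and front moves back by 2 (mod 32). The
 * expression (grp->front - i) % QFQ_MAX_SLOTS is safe even when the
 * unsigned subtraction wraps, because QFQ_MAX_SLOTS = 32 divides 2^32.
 */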
    955
    956static void qfq_update_eligible(struct qfq_sched *q)
    957{
    958	struct qfq_group *grp;
    959	unsigned long ineligible;
    960
    961	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
    962	if (ineligible) {
    963		if (!q->bitmaps[ER]) {
    964			grp = qfq_ffs(q, ineligible);
    965			if (qfq_gt(grp->S, q->V))
    966				q->V = grp->S;
    967		}
    968		qfq_make_eligible(q);
    969	}
    970}
    971
    972/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
    973static void agg_dequeue(struct qfq_aggregate *agg,
    974			struct qfq_class *cl, unsigned int len)
    975{
    976	qdisc_dequeue_peeked(cl->qdisc);
    977
    978	cl->deficit -= (int) len;
    979
    980	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
    981		list_del(&cl->alist);
    982	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
    983		cl->deficit += agg->lmax;
    984		list_move_tail(&cl->alist, &agg->active);
    985	}
    986}
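
/* For illustration of the DRR step above: with agg->lmax = 1514 and a
 * class holding 1000-byte packets, the class starts with deficit 1514,
 * dequeues one packet (deficit 514 < the next packet's 1000 bytes), then
 * gains another quantum (514 + 1514 = 2028) and moves to the tail of the
 * active list, so the other classes in the aggregate get their turn.
 */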
    987
    988static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
    989					   struct qfq_class **cl,
    990					   unsigned int *len)
    991{
    992	struct sk_buff *skb;
    993
    994	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
    995	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
    996	if (skb == NULL)
    997		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
    998	else
    999		*len = qdisc_pkt_len(skb);
   1000
   1001	return skb;
   1002}
   1003
   1004/* Update F according to the actual service received by the aggregate. */
   1005static inline void charge_actual_service(struct qfq_aggregate *agg)
   1006{
   1007	/* Compute the service received by the aggregate, taking into
   1008	 * account that, after decreasing the number of classes in
   1009	 * agg, it may happen that
    1010	 * agg->initial_budget - agg->budget > agg->budgetmax
   1011	 */
   1012	u32 service_received = min(agg->budgetmax,
   1013				   agg->initial_budget - agg->budget);
   1014
   1015	agg->F = agg->S + (u64)service_received * agg->inv_w;
   1016}
   1017
   1018/* Assign a reasonable start time for a new aggregate in group i.
    1019 * Admissible values for \hat{F} are multiples of \sigma_i
   1020 * no greater than V+\sigma_i . Larger values mean that
   1021 * we had a wraparound so we consider the timestamp to be stale.
   1022 *
   1023 * If F is not stale and F >= V then we set S = F.
   1024 * Otherwise we should assign S = V, but this may violate
   1025 * the ordering in EB (see [2]). So, if we have groups in ER,
   1026 * set S to the F_j of the first group j which would be blocking us.
   1027 * We are guaranteed not to move S backward because
   1028 * otherwise our group i would still be blocked.
   1029 */
   1030static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
   1031{
   1032	unsigned long mask;
   1033	u64 limit, roundedF;
   1034	int slot_shift = agg->grp->slot_shift;
   1035
   1036	roundedF = qfq_round_down(agg->F, slot_shift);
   1037	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
   1038
   1039	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
   1040		/* timestamp was stale */
   1041		mask = mask_from(q->bitmaps[ER], agg->grp->index);
   1042		if (mask) {
   1043			struct qfq_group *next = qfq_ffs(q, mask);
   1044			if (qfq_gt(roundedF, next->F)) {
   1045				if (qfq_gt(limit, next->F))
   1046					agg->S = next->F;
   1047				else /* preserve timestamp correctness */
   1048					agg->S = limit;
   1049				return;
   1050			}
   1051		}
   1052		agg->S = q->V;
   1053	} else  /* timestamp is not stale */
   1054		agg->S = agg->F;
   1055}
   1056
   1057/* Update the timestamps of agg before scheduling/rescheduling it for
   1058 * service.  In particular, assign to agg->F its maximum possible
   1059 * value, i.e., the virtual finish time with which the aggregate
   1060 * should be labeled if it used all its budget once in service.
   1061 */
   1062static inline void
   1063qfq_update_agg_ts(struct qfq_sched *q,
   1064		    struct qfq_aggregate *agg, enum update_reason reason)
   1065{
   1066	if (reason != requeue)
   1067		qfq_update_start(q, agg);
   1068	else /* just charge agg for the service received */
   1069		agg->S = agg->F;
   1070
   1071	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
   1072}
   1073
   1074static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
   1075
   1076static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
   1077{
   1078	struct qfq_sched *q = qdisc_priv(sch);
   1079	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
   1080	struct qfq_class *cl;
   1081	struct sk_buff *skb = NULL;
   1082	/* next-packet len, 0 means no more active classes in in-service agg */
   1083	unsigned int len = 0;
   1084
   1085	if (in_serv_agg == NULL)
   1086		return NULL;
   1087
   1088	if (!list_empty(&in_serv_agg->active))
   1089		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
   1090
   1091	/*
   1092	 * If there are no active classes in the in-service aggregate,
    1093	 * or if the aggregate does not have enough budget to serve its next
   1094	 * class, then choose the next aggregate to serve.
   1095	 */
   1096	if (len == 0 || in_serv_agg->budget < len) {
   1097		charge_actual_service(in_serv_agg);
   1098
   1099		/* recharge the budget of the aggregate */
   1100		in_serv_agg->initial_budget = in_serv_agg->budget =
   1101			in_serv_agg->budgetmax;
   1102
   1103		if (!list_empty(&in_serv_agg->active)) {
   1104			/*
   1105			 * Still active: reschedule for
   1106			 * service. Possible optimization: if no other
   1107			 * aggregate is active, then there is no point
   1108			 * in rescheduling this aggregate, and we can
   1109			 * just keep it as the in-service one. This
   1110			 * should be however a corner case, and to
   1111			 * handle it, we would need to maintain an
   1112			 * extra num_active_aggs field.
   1113			*/
   1114			qfq_update_agg_ts(q, in_serv_agg, requeue);
   1115			qfq_schedule_agg(q, in_serv_agg);
   1116		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
   1117			q->in_serv_agg = NULL;
   1118			return NULL;
   1119		}
   1120
   1121		/*
   1122		 * If we get here, there are other aggregates queued:
   1123		 * choose the new aggregate to serve.
   1124		 */
   1125		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
   1126		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
   1127	}
   1128	if (!skb)
   1129		return NULL;
   1130
   1131	qdisc_qstats_backlog_dec(sch, skb);
   1132	sch->q.qlen--;
   1133	qdisc_bstats_update(sch, skb);
   1134
   1135	agg_dequeue(in_serv_agg, cl, len);
    1136	/* If qfq_change_class() lowers lmax for a class that still
    1137	 * owns pending packets larger than the new lmax, then the
    1138	 * following condition may hold.
    1139	 */
   1140	if (unlikely(in_serv_agg->budget < len))
   1141		in_serv_agg->budget = 0;
   1142	else
   1143		in_serv_agg->budget -= len;
   1144
   1145	q->V += (u64)len * q->iwsum;
   1146	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
   1147		 len, (unsigned long long) in_serv_agg->F,
   1148		 (unsigned long long) q->V);
   1149
   1150	return skb;
   1151}
   1152
   1153static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
   1154{
   1155	struct qfq_group *grp;
   1156	struct qfq_aggregate *agg, *new_front_agg;
   1157	u64 old_F;
   1158
   1159	qfq_update_eligible(q);
   1160	q->oldV = q->V;
   1161
   1162	if (!q->bitmaps[ER])
   1163		return NULL;
   1164
   1165	grp = qfq_ffs(q, q->bitmaps[ER]);
   1166	old_F = grp->F;
   1167
   1168	agg = qfq_slot_head(grp);
   1169
   1170	/* agg starts to be served, remove it from schedule */
   1171	qfq_front_slot_remove(grp);
   1172
   1173	new_front_agg = qfq_slot_scan(grp);
   1174
   1175	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
   1176		__clear_bit(grp->index, &q->bitmaps[ER]);
   1177	else {
   1178		u64 roundedS = qfq_round_down(new_front_agg->S,
   1179					      grp->slot_shift);
   1180		unsigned int s;
   1181
   1182		if (grp->S == roundedS)
   1183			return agg;
   1184		grp->S = roundedS;
   1185		grp->F = roundedS + (2ULL << grp->slot_shift);
   1186		__clear_bit(grp->index, &q->bitmaps[ER]);
   1187		s = qfq_calc_state(q, grp);
   1188		__set_bit(grp->index, &q->bitmaps[s]);
   1189	}
   1190
   1191	qfq_unblock_groups(q, grp->index, old_F);
   1192
   1193	return agg;
   1194}
   1195
   1196static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
   1197		       struct sk_buff **to_free)
   1198{
   1199	unsigned int len = qdisc_pkt_len(skb), gso_segs;
   1200	struct qfq_sched *q = qdisc_priv(sch);
   1201	struct qfq_class *cl;
   1202	struct qfq_aggregate *agg;
   1203	int err = 0;
   1204	bool first;
   1205
   1206	cl = qfq_classify(skb, sch, &err);
   1207	if (cl == NULL) {
   1208		if (err & __NET_XMIT_BYPASS)
   1209			qdisc_qstats_drop(sch);
   1210		__qdisc_drop(skb, to_free);
   1211		return err;
   1212	}
   1213	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
   1214
   1215	if (unlikely(cl->agg->lmax < len)) {
   1216		pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
   1217			 cl->agg->lmax, len, cl->common.classid);
   1218		err = qfq_change_agg(sch, cl, cl->agg->class_weight, len);
   1219		if (err) {
   1220			cl->qstats.drops++;
   1221			return qdisc_drop(skb, sch, to_free);
   1222		}
   1223	}
   1224
   1225	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
   1226	first = !cl->qdisc->q.qlen;
   1227	err = qdisc_enqueue(skb, cl->qdisc, to_free);
   1228	if (unlikely(err != NET_XMIT_SUCCESS)) {
   1229		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
   1230		if (net_xmit_drop_count(err)) {
   1231			cl->qstats.drops++;
   1232			qdisc_qstats_drop(sch);
   1233		}
   1234		return err;
   1235	}
   1236
   1237	_bstats_update(&cl->bstats, len, gso_segs);
   1238	sch->qstats.backlog += len;
   1239	++sch->q.qlen;
   1240
   1241	agg = cl->agg;
   1242	/* if the queue was not empty, then done here */
   1243	if (!first) {
   1244		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
   1245		    list_first_entry(&agg->active, struct qfq_class, alist)
   1246		    == cl && cl->deficit < len)
   1247			list_move_tail(&cl->alist, &agg->active);
   1248
   1249		return err;
   1250	}
   1251
   1252	/* schedule class for service within the aggregate */
   1253	cl->deficit = agg->lmax;
   1254	list_add_tail(&cl->alist, &agg->active);
   1255
   1256	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
   1257	    q->in_serv_agg == agg)
   1258		return err; /* non-empty or in service, nothing else to do */
   1259
   1260	qfq_activate_agg(q, agg, enqueue);
   1261
   1262	return err;
   1263}
   1264
   1265/*
   1266 * Schedule aggregate according to its timestamps.
   1267 */
   1268static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
   1269{
   1270	struct qfq_group *grp = agg->grp;
   1271	u64 roundedS;
   1272	int s;
   1273
   1274	roundedS = qfq_round_down(agg->S, grp->slot_shift);
   1275
   1276	/*
   1277	 * Insert agg in the correct bucket.
   1278	 * If agg->S >= grp->S we don't need to adjust the
   1279	 * bucket list and simply go to the insertion phase.
   1280	 * Otherwise grp->S is decreasing, we must make room
   1281	 * in the bucket list, and also recompute the group state.
   1282	 * Finally, if there were no flows in this group and nobody
   1283	 * was in ER make sure to adjust V.
   1284	 */
   1285	if (grp->full_slots) {
   1286		if (!qfq_gt(grp->S, agg->S))
   1287			goto skip_update;
   1288
   1289		/* create a slot for this agg->S */
   1290		qfq_slot_rotate(grp, roundedS);
   1291		/* group was surely ineligible, remove */
   1292		__clear_bit(grp->index, &q->bitmaps[IR]);
   1293		__clear_bit(grp->index, &q->bitmaps[IB]);
   1294	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
   1295		   q->in_serv_agg == NULL)
   1296		q->V = roundedS;
   1297
   1298	grp->S = roundedS;
   1299	grp->F = roundedS + (2ULL << grp->slot_shift);
   1300	s = qfq_calc_state(q, grp);
   1301	__set_bit(grp->index, &q->bitmaps[s]);
   1302
   1303	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
   1304		 s, q->bitmaps[s],
   1305		 (unsigned long long) agg->S,
   1306		 (unsigned long long) agg->F,
   1307		 (unsigned long long) q->V);
   1308
   1309skip_update:
   1310	qfq_slot_insert(grp, agg, roundedS);
   1311}
   1312
   1313
   1314/* Update agg ts and schedule agg for service */
   1315static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
   1316			     enum update_reason reason)
   1317{
   1318	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
   1319
   1320	qfq_update_agg_ts(q, agg, reason);
   1321	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
   1322		q->in_serv_agg = agg; /* start serving this aggregate */
   1323		 /* update V: to be in service, agg must be eligible */
   1324		q->oldV = q->V = agg->S;
   1325	} else if (agg != q->in_serv_agg)
   1326		qfq_schedule_agg(q, agg);
   1327}
   1328
   1329static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
   1330			    struct qfq_aggregate *agg)
   1331{
   1332	unsigned int i, offset;
   1333	u64 roundedS;
   1334
   1335	roundedS = qfq_round_down(agg->S, grp->slot_shift);
   1336	offset = (roundedS - grp->S) >> grp->slot_shift;
   1337
   1338	i = (grp->front + offset) % QFQ_MAX_SLOTS;
   1339
   1340	hlist_del(&agg->next);
   1341	if (hlist_empty(&grp->slots[i]))
   1342		__clear_bit(offset, &grp->full_slots);
   1343}
   1344
   1345/*
   1346 * Called to forcibly deschedule an aggregate.  If the aggregate is
   1347 * not in the front bucket, or if the latter has other aggregates in
   1348 * the front bucket, we can simply remove the aggregate with no other
   1349 * side effects.
   1350 * Otherwise we must propagate the event up.
   1351 */
   1352static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
   1353{
   1354	struct qfq_group *grp = agg->grp;
   1355	unsigned long mask;
   1356	u64 roundedS;
   1357	int s;
   1358
   1359	if (agg == q->in_serv_agg) {
   1360		charge_actual_service(agg);
   1361		q->in_serv_agg = qfq_choose_next_agg(q);
   1362		return;
   1363	}
   1364
   1365	agg->F = agg->S;
   1366	qfq_slot_remove(q, grp, agg);
   1367
   1368	if (!grp->full_slots) {
   1369		__clear_bit(grp->index, &q->bitmaps[IR]);
   1370		__clear_bit(grp->index, &q->bitmaps[EB]);
   1371		__clear_bit(grp->index, &q->bitmaps[IB]);
   1372
   1373		if (test_bit(grp->index, &q->bitmaps[ER]) &&
   1374		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
   1375			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
   1376			if (mask)
   1377				mask = ~((1UL << __fls(mask)) - 1);
   1378			else
   1379				mask = ~0UL;
   1380			qfq_move_groups(q, mask, EB, ER);
   1381			qfq_move_groups(q, mask, IB, IR);
   1382		}
   1383		__clear_bit(grp->index, &q->bitmaps[ER]);
   1384	} else if (hlist_empty(&grp->slots[grp->front])) {
   1385		agg = qfq_slot_scan(grp);
   1386		roundedS = qfq_round_down(agg->S, grp->slot_shift);
   1387		if (grp->S != roundedS) {
   1388			__clear_bit(grp->index, &q->bitmaps[ER]);
   1389			__clear_bit(grp->index, &q->bitmaps[IR]);
   1390			__clear_bit(grp->index, &q->bitmaps[EB]);
   1391			__clear_bit(grp->index, &q->bitmaps[IB]);
   1392			grp->S = roundedS;
   1393			grp->F = roundedS + (2ULL << grp->slot_shift);
   1394			s = qfq_calc_state(q, grp);
   1395			__set_bit(grp->index, &q->bitmaps[s]);
   1396		}
   1397	}
   1398}
   1399
   1400static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
   1401{
   1402	struct qfq_sched *q = qdisc_priv(sch);
   1403	struct qfq_class *cl = (struct qfq_class *)arg;
   1404
   1405	qfq_deactivate_class(q, cl);
   1406}
   1407
   1408static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
   1409			  struct netlink_ext_ack *extack)
   1410{
   1411	struct qfq_sched *q = qdisc_priv(sch);
   1412	struct qfq_group *grp;
   1413	int i, j, err;
   1414	u32 max_cl_shift, maxbudg_shift, max_classes;
   1415
   1416	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
   1417	if (err)
   1418		return err;
   1419
   1420	err = qdisc_class_hash_init(&q->clhash);
   1421	if (err < 0)
   1422		return err;
   1423
   1424	max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
   1425			    QFQ_MAX_AGG_CLASSES);
   1426	/* max_cl_shift = floor(log_2(max_classes)) */
   1427	max_cl_shift = __fls(max_classes);
   1428	q->max_agg_classes = 1<<max_cl_shift;
   1429
   1430	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
   1431	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
   1432	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
   1433
   1434	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
   1435		grp = &q->groups[i];
   1436		grp->index = i;
   1437		grp->slot_shift = q->min_slot_shift + i;
   1438		for (j = 0; j < QFQ_MAX_SLOTS; j++)
   1439			INIT_HLIST_HEAD(&grp->slots[j]);
   1440	}
   1441
   1442	INIT_HLIST_HEAD(&q->nonfull_aggs);
   1443
   1444	return 0;
   1445}
   1446
   1447static void qfq_reset_qdisc(struct Qdisc *sch)
   1448{
   1449	struct qfq_sched *q = qdisc_priv(sch);
   1450	struct qfq_class *cl;
   1451	unsigned int i;
   1452
   1453	for (i = 0; i < q->clhash.hashsize; i++) {
   1454		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
   1455			if (cl->qdisc->q.qlen > 0)
   1456				qfq_deactivate_class(q, cl);
   1457
   1458			qdisc_reset(cl->qdisc);
   1459		}
   1460	}
   1461	sch->qstats.backlog = 0;
   1462	sch->q.qlen = 0;
   1463}
   1464
   1465static void qfq_destroy_qdisc(struct Qdisc *sch)
   1466{
   1467	struct qfq_sched *q = qdisc_priv(sch);
   1468	struct qfq_class *cl;
   1469	struct hlist_node *next;
   1470	unsigned int i;
   1471
   1472	tcf_block_put(q->block);
   1473
   1474	for (i = 0; i < q->clhash.hashsize; i++) {
   1475		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
   1476					  common.hnode) {
   1477			qfq_destroy_class(sch, cl);
   1478		}
   1479	}
   1480	qdisc_class_hash_destroy(&q->clhash);
   1481}
   1482
   1483static const struct Qdisc_class_ops qfq_class_ops = {
   1484	.change		= qfq_change_class,
   1485	.delete		= qfq_delete_class,
   1486	.find		= qfq_search_class,
   1487	.tcf_block	= qfq_tcf_block,
   1488	.bind_tcf	= qfq_bind_tcf,
   1489	.unbind_tcf	= qfq_unbind_tcf,
   1490	.graft		= qfq_graft_class,
   1491	.leaf		= qfq_class_leaf,
   1492	.qlen_notify	= qfq_qlen_notify,
   1493	.dump		= qfq_dump_class,
   1494	.dump_stats	= qfq_dump_class_stats,
   1495	.walk		= qfq_walk,
   1496};
   1497
   1498static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
   1499	.cl_ops		= &qfq_class_ops,
   1500	.id		= "qfq",
   1501	.priv_size	= sizeof(struct qfq_sched),
   1502	.enqueue	= qfq_enqueue,
   1503	.dequeue	= qfq_dequeue,
   1504	.peek		= qdisc_peek_dequeued,
   1505	.init		= qfq_init_qdisc,
   1506	.reset		= qfq_reset_qdisc,
   1507	.destroy	= qfq_destroy_qdisc,
   1508	.owner		= THIS_MODULE,
   1509};
   1510
   1511static int __init qfq_init(void)
   1512{
   1513	return register_qdisc(&qfq_qdisc_ops);
   1514}
   1515
   1516static void __exit qfq_exit(void)
   1517{
   1518	unregister_qdisc(&qfq_qdisc_ops);
   1519}
   1520
   1521module_init(qfq_init);
   1522module_exit(qfq_exit);
   1523MODULE_LICENSE("GPL");
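
/*
 * Example userspace usage (a sketch; the device, handles and match are
 * placeholders, and an iproute2 tc built with QFQ support is assumed):
 *
 *	tc qdisc add dev eth0 root handle 1: qfq
 *	tc class add dev eth0 parent 1: classid 1:1 qfq weight 10 maxpkt 1514
 *	tc class add dev eth0 parent 1: classid 1:2 qfq weight 1 maxpkt 1514
 *	tc filter add dev eth0 parent 1: protocol ip u32 \
 *		match ip dport 80 0xffff flowid 1:1
 *
 * Packets are steered to a class by filters (or by skb->priority matching
 * a classid, see qfq_classify()); each backlogged class then receives link
 * share proportional to its weight.
 */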