cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_sfq.c (23057B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic	from non-interactive. It means, that this scheduler
	should be used as leaf of CBQ or P3, which put interactive traffic
	to higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits :
	- maximal queue length per flow to 127 packets.
	- max mtu to 2^18-1;
	- max 65408 flows,
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
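
/* Worked example (illustrative, not from the original file): with the default
 * quantum of one MTU, say 1514 bytes, SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190 scaled units per round, and even a 64K packet costs only
 * 65536 / 8 = 8192 units, so the signed 16-bit 'allot' credit cannot overflow.
 */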

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We dont use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int    backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t 	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
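
/* Illustrative example of the index scheme above (editorial note): val = 5
 * resolves to &q->slots[5].dep, i.e. the anchor of flow slot 5, while
 * val = SFQ_MAX_FLOWS + 3 resolves to &q->dep[3], the list head of all flows
 * currently holding exactly 3 packets (dep[0] being the free-slot list).
 */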

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

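/* Enqueue path overview (descriptive note): classify the skb into a hash
 * bucket, look up the slot bound to that bucket (allocating a free one from
 * dep[0] if the bucket is empty), optionally run the per-flow RED/ECN checks,
 * and append the skb to the slot.  A flow exceeding q->maxdepth either drops
 * the new packet or, with headdrop enabled, drops its oldest one; a new flow
 * is linked at the tail of the round-robin list with a fresh allotment.
 */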
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could endup servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

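/* Dequeue path overview (descriptive note): the active flows form a circular
 * list with q->tail marking where the last round stopped.  Each dequeued
 * packet spends SFQ_ALLOT_SIZE(pkt_len) of the slot's allot; once the allot
 * drops to zero or below, the slot is rotated to the tail and refilled with
 * scaled_quantum, which makes the scheduler a deficit round robin over the
 * hashed flows.
 */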
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	siphash_key_t nkey;

	get_random_bytes(&nkey, sizeof(nkey));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	/* slot->allot is a short, make sure quantum is not too big. */
	if (ctl->quantum) {
		unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);

		if (scaled <= 0 || scaled > SHRT_MAX)
			return -EINVAL;
	}

	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	return  kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
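
For reference, a minimal userspace sketch of the v0 option block that sfq_change() reads from the TCA_OPTIONS attribute. It only fills and prints struct tc_sfq_qopt (from the uapi header <linux/pkt_sched.h>); actually installing the qdisc additionally requires an rtnetlink RTM_NEWQDISC request, which the tc(8) utility issues for you. The field values below are illustrative assumptions, roughly mirroring the defaults set in sfq_init().

#include <stdio.h>
#include <linux/pkt_sched.h>	/* struct tc_sfq_qopt (uapi) */

int main(void)
{
	/* Knobs consumed by sfq_change(): quantum, perturb_period, limit,
	 * divisor and flows.  Illustrative values only.
	 */
	struct tc_sfq_qopt opt = {
		.quantum	= 1514,	/* bytes per flow per round, should be >= MTU */
		.perturb_period	= 10,	/* seconds between siphash re-keying, 0 = never */
		.limit		= 127,	/* total packets held by the qdisc */
		.divisor	= 1024,	/* hash buckets, must be a power of two <= 65536 */
		.flows		= 128,	/* flow slots, capped at SFQ_MAX_FLOWS */
	};

	printf("quantum=%u perturb=%d limit=%u divisor=%u flows=%u\n",
	       opt.quantum, opt.perturb_period, opt.limit,
	       opt.divisor, opt.flows);
	return 0;
}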