cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_fq.c (27303B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
      4 *
      5 *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
      6 *
       7 *  Meant to be mostly used for locally generated traffic:
       8 *  Fast classification depends on skb->sk being set before reaching us.
       9 *  If not (router workload), we use rxhash as a fallback, with a 32-bit wide hash.
      10 *  All packets belonging to a socket are considered a 'flow'.
     11 *
      12 *  Flows are dynamically allocated and stored in a hash table of RB trees.
      13 *  They are also part of one of the Round Robin 'queues' (new or old flows).
     14 *
      15 *  Burst avoidance (aka pacing) capability:
     16 *
     17 *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
     18 *  bunch of packets, and this packet scheduler adds delay between
     19 *  packets to respect rate limitation.
     20 *
      21 *  enqueue():
      22 *   - lookup one RB tree (out of 1024 or more) to find the flow.
      23 *     If the flow does not exist, create it and add it to the tree.
      24 *     Add the skb to the per-flow list of skbs (fifo).
      25 *   - Use a special fifo for high prio packets
     26 *
      27 *  dequeue(): serves flows in Round Robin
      28 *  Note: When a flow becomes empty, we do not immediately remove it from
      29 *  the rb trees, for performance reasons (it's expected to send additional packets,
      30 *  or the SLAB cache will reuse the socket for another flow)
     31 */
     32
     33#include <linux/module.h>
     34#include <linux/types.h>
     35#include <linux/kernel.h>
     36#include <linux/jiffies.h>
     37#include <linux/string.h>
     38#include <linux/in.h>
     39#include <linux/errno.h>
     40#include <linux/init.h>
     41#include <linux/skbuff.h>
     42#include <linux/slab.h>
     43#include <linux/rbtree.h>
     44#include <linux/hash.h>
     45#include <linux/prefetch.h>
     46#include <linux/vmalloc.h>
     47#include <net/netlink.h>
     48#include <net/pkt_sched.h>
     49#include <net/sock.h>
     50#include <net/tcp_states.h>
     51#include <net/tcp.h>
     52
     53struct fq_skb_cb {
     54	u64	        time_to_send;
     55};
     56
     57static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
     58{
     59	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
     60	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
     61}
     62
     63/*
     64 * Per flow structure, dynamically allocated.
      65 * If packets have monotonically increasing time_to_send, they are placed in O(1)
      66 * in a linear list (head, tail), otherwise they are placed in an rbtree (t_root).
     67 */
     68struct fq_flow {
     69/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
     70	struct rb_root	t_root;
     71	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
     72	union {
     73		struct sk_buff *tail;	/* last skb in the list */
     74		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
     75	};
     76	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
     77	struct sock	*sk;
     78	u32		socket_hash;	/* sk_hash */
     79	int		qlen;		/* number of packets in flow queue */
     80
     81/* Second cache line, used in fq_dequeue() */
     82	int		credit;
     83	/* 32bit hole on 64bit arches */
     84
     85	struct fq_flow *next;		/* next pointer in RR lists */
     86
     87	struct rb_node  rate_node;	/* anchor in q->delayed tree */
     88	u64		time_next_packet;
     89} ____cacheline_aligned_in_smp;
     90
     91struct fq_flow_head {
     92	struct fq_flow *first;
     93	struct fq_flow *last;
     94};
     95
     96struct fq_sched_data {
     97	struct fq_flow_head new_flows;
     98
     99	struct fq_flow_head old_flows;
    100
    101	struct rb_root	delayed;	/* for rate limited flows */
    102	u64		time_next_delayed_flow;
    103	u64		ktime_cache;	/* copy of last ktime_get_ns() */
    104	unsigned long	unthrottle_latency_ns;
    105
    106	struct fq_flow	internal;	/* for non classified or high prio packets */
    107	u32		quantum;
    108	u32		initial_quantum;
    109	u32		flow_refill_delay;
    110	u32		flow_plimit;	/* max packets per flow */
    111	unsigned long	flow_max_rate;	/* optional max rate per flow */
    112	u64		ce_threshold;
    113	u64		horizon;	/* horizon in ns */
    114	u32		orphan_mask;	/* mask for orphaned skb */
    115	u32		low_rate_threshold;
    116	struct rb_root	*fq_root;
    117	u8		rate_enable;
    118	u8		fq_trees_log;
    119	u8		horizon_drop;
    120	u32		flows;
    121	u32		inactive_flows;
    122	u32		throttled_flows;
    123
    124	u64		stat_gc_flows;
    125	u64		stat_internal_packets;
    126	u64		stat_throttled;
    127	u64		stat_ce_mark;
    128	u64		stat_horizon_drops;
    129	u64		stat_horizon_caps;
    130	u64		stat_flows_plimit;
    131	u64		stat_pkts_too_long;
    132	u64		stat_allocation_errors;
    133
    134	u32		timer_slack; /* hrtimer slack in ns */
    135	struct qdisc_watchdog watchdog;
    136};
    137
    138/*
    139 * f->tail and f->age share the same location.
     140 * We can use the low order bit to differentiate whether this location points
     141 * to a sk_buff or contains a jiffies value, provided we force the jiffies value to be odd.
     142 * This works because the low order bit of f->tail is 0, since alignof(struct sk_buff) >= 2.
    143 */
    144static void fq_flow_set_detached(struct fq_flow *f)
    145{
    146	f->age = jiffies | 1UL;
    147}
    148
    149static bool fq_flow_is_detached(const struct fq_flow *f)
    150{
    151	return !!(f->age & 1UL);
    152}
    153
    154/* special value to mark a throttled flow (not on old/new list) */
    155static struct fq_flow throttled;
    156
    157static bool fq_flow_is_throttled(const struct fq_flow *f)
    158{
    159	return f->next == &throttled;
    160}
    161
    162static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
    163{
    164	if (head->first)
    165		head->last->next = flow;
    166	else
    167		head->first = flow;
    168	head->last = flow;
    169	flow->next = NULL;
    170}
    171
    172static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
    173{
    174	rb_erase(&f->rate_node, &q->delayed);
    175	q->throttled_flows--;
    176	fq_flow_add_tail(&q->old_flows, f);
    177}
    178
    179static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
    180{
    181	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
    182
    183	while (*p) {
    184		struct fq_flow *aux;
    185
    186		parent = *p;
    187		aux = rb_entry(parent, struct fq_flow, rate_node);
    188		if (f->time_next_packet >= aux->time_next_packet)
    189			p = &parent->rb_right;
    190		else
    191			p = &parent->rb_left;
    192	}
    193	rb_link_node(&f->rate_node, parent, p);
    194	rb_insert_color(&f->rate_node, &q->delayed);
    195	q->throttled_flows++;
    196	q->stat_throttled++;
    197
    198	f->next = &throttled;
    199	if (q->time_next_delayed_flow > f->time_next_packet)
    200		q->time_next_delayed_flow = f->time_next_packet;
    201}
    202
    203
    204static struct kmem_cache *fq_flow_cachep __read_mostly;
    205
    206
    207/* limit number of collected flows per round */
    208#define FQ_GC_MAX 8
    209#define FQ_GC_AGE (3*HZ)
    210
    211static bool fq_gc_candidate(const struct fq_flow *f)
    212{
    213	return fq_flow_is_detached(f) &&
    214	       time_after(jiffies, f->age + FQ_GC_AGE);
    215}
    216
    217static void fq_gc(struct fq_sched_data *q,
    218		  struct rb_root *root,
    219		  struct sock *sk)
    220{
    221	struct rb_node **p, *parent;
    222	void *tofree[FQ_GC_MAX];
    223	struct fq_flow *f;
    224	int i, fcnt = 0;
    225
    226	p = &root->rb_node;
    227	parent = NULL;
    228	while (*p) {
    229		parent = *p;
    230
    231		f = rb_entry(parent, struct fq_flow, fq_node);
    232		if (f->sk == sk)
    233			break;
    234
    235		if (fq_gc_candidate(f)) {
    236			tofree[fcnt++] = f;
    237			if (fcnt == FQ_GC_MAX)
    238				break;
    239		}
    240
    241		if (f->sk > sk)
    242			p = &parent->rb_right;
    243		else
    244			p = &parent->rb_left;
    245	}
    246
    247	if (!fcnt)
    248		return;
    249
    250	for (i = fcnt; i > 0; ) {
    251		f = tofree[--i];
    252		rb_erase(&f->fq_node, root);
    253	}
    254	q->flows -= fcnt;
    255	q->inactive_flows -= fcnt;
    256	q->stat_gc_flows += fcnt;
    257
    258	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
    259}
    260
    261static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
    262{
    263	struct rb_node **p, *parent;
    264	struct sock *sk = skb->sk;
    265	struct rb_root *root;
    266	struct fq_flow *f;
    267
    268	/* warning: no starvation prevention... */
    269	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
    270		return &q->internal;
    271
    272	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
    273	 * or a listener (SYNCOOKIE mode)
    274	 * 1) request sockets are not full blown,
    275	 *    they do not contain sk_pacing_rate
    276	 * 2) They are not part of a 'flow' yet
    277	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
    278	 *    especially if the listener set SO_MAX_PACING_RATE
    279	 * 4) We pretend they are orphaned
    280	 */
    281	if (!sk || sk_listener(sk)) {
    282		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
    283
    284		/* By forcing low order bit to 1, we make sure to not
    285		 * collide with a local flow (socket pointers are word aligned)
    286		 */
    287		sk = (struct sock *)((hash << 1) | 1UL);
    288		skb_orphan(skb);
    289	} else if (sk->sk_state == TCP_CLOSE) {
    290		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
    291		/*
     292		 * Sockets in TCP_CLOSE are not connected.
     293		 * The typical use case is UDP sockets, which can send packets
    294		 * with sendto() to many different destinations.
    295		 * We probably could use a generic bit advertising
    296		 * non connected sockets, instead of sk_state == TCP_CLOSE,
    297		 * if we care enough.
    298		 */
    299		sk = (struct sock *)((hash << 1) | 1UL);
    300	}
    301
    302	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
    303
    304	if (q->flows >= (2U << q->fq_trees_log) &&
    305	    q->inactive_flows > q->flows/2)
    306		fq_gc(q, root, sk);
    307
    308	p = &root->rb_node;
    309	parent = NULL;
    310	while (*p) {
    311		parent = *p;
    312
    313		f = rb_entry(parent, struct fq_flow, fq_node);
    314		if (f->sk == sk) {
    315			/* socket might have been reallocated, so check
    316			 * if its sk_hash is the same.
     317			 * If not, we need to refill credit with
     318			 * the initial quantum
    319			 */
    320			if (unlikely(skb->sk == sk &&
    321				     f->socket_hash != sk->sk_hash)) {
    322				f->credit = q->initial_quantum;
    323				f->socket_hash = sk->sk_hash;
    324				if (q->rate_enable)
    325					smp_store_release(&sk->sk_pacing_status,
    326							  SK_PACING_FQ);
    327				if (fq_flow_is_throttled(f))
    328					fq_flow_unset_throttled(q, f);
    329				f->time_next_packet = 0ULL;
    330			}
    331			return f;
    332		}
    333		if (f->sk > sk)
    334			p = &parent->rb_right;
    335		else
    336			p = &parent->rb_left;
    337	}
    338
    339	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
    340	if (unlikely(!f)) {
    341		q->stat_allocation_errors++;
    342		return &q->internal;
    343	}
    344	/* f->t_root is already zeroed after kmem_cache_zalloc() */
    345
    346	fq_flow_set_detached(f);
    347	f->sk = sk;
    348	if (skb->sk == sk) {
    349		f->socket_hash = sk->sk_hash;
    350		if (q->rate_enable)
    351			smp_store_release(&sk->sk_pacing_status,
    352					  SK_PACING_FQ);
    353	}
    354	f->credit = q->initial_quantum;
    355
    356	rb_link_node(&f->fq_node, parent, p);
    357	rb_insert_color(&f->fq_node, root);
    358
    359	q->flows++;
    360	q->inactive_flows++;
    361	return f;
    362}
    363
    364static struct sk_buff *fq_peek(struct fq_flow *flow)
    365{
    366	struct sk_buff *skb = skb_rb_first(&flow->t_root);
    367	struct sk_buff *head = flow->head;
    368
    369	if (!skb)
    370		return head;
    371
    372	if (!head)
    373		return skb;
    374
    375	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
    376		return skb;
    377	return head;
    378}
    379
    380static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
    381			  struct sk_buff *skb)
    382{
    383	if (skb == flow->head) {
    384		flow->head = skb->next;
    385	} else {
    386		rb_erase(&skb->rbnode, &flow->t_root);
    387		skb->dev = qdisc_dev(sch);
    388	}
    389}
    390
     391/* Remove one skb from the flow queue.
     392 * This skb must be the return value of a prior fq_peek().
    393 */
    394static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
    395			   struct sk_buff *skb)
    396{
    397	fq_erase_head(sch, flow, skb);
    398	skb_mark_not_on_list(skb);
    399	flow->qlen--;
    400	qdisc_qstats_backlog_dec(sch, skb);
    401	sch->q.qlen--;
    402}
    403
    404static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
    405{
    406	struct rb_node **p, *parent;
    407	struct sk_buff *head, *aux;
    408
    409	head = flow->head;
    410	if (!head ||
    411	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
    412		if (!head)
    413			flow->head = skb;
    414		else
    415			flow->tail->next = skb;
    416		flow->tail = skb;
    417		skb->next = NULL;
    418		return;
    419	}
    420
    421	p = &flow->t_root.rb_node;
    422	parent = NULL;
    423
    424	while (*p) {
    425		parent = *p;
    426		aux = rb_to_skb(parent);
    427		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
    428			p = &parent->rb_right;
    429		else
    430			p = &parent->rb_left;
    431	}
    432	rb_link_node(&skb->rbnode, parent, p);
    433	rb_insert_color(&skb->rbnode, &flow->t_root);
    434}
    435
    436static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
    437				    const struct fq_sched_data *q)
    438{
    439	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
    440}
    441
    442static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
    443		      struct sk_buff **to_free)
    444{
    445	struct fq_sched_data *q = qdisc_priv(sch);
    446	struct fq_flow *f;
    447
    448	if (unlikely(sch->q.qlen >= sch->limit))
    449		return qdisc_drop(skb, sch, to_free);
    450
    451	if (!skb->tstamp) {
    452		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
    453	} else {
     454		/* Check if the packet timestamp is too far in the future.
     455		 * Check against our cached time first, to avoid the ktime_get_ns()
     456		 * cost in most cases.
    457		 */
    458		if (fq_packet_beyond_horizon(skb, q)) {
     459			/* Refresh our cache and check again */
    460			q->ktime_cache = ktime_get_ns();
    461			if (fq_packet_beyond_horizon(skb, q)) {
    462				if (q->horizon_drop) {
    463					q->stat_horizon_drops++;
    464					return qdisc_drop(skb, sch, to_free);
    465				}
    466				q->stat_horizon_caps++;
    467				skb->tstamp = q->ktime_cache + q->horizon;
    468			}
    469		}
    470		fq_skb_cb(skb)->time_to_send = skb->tstamp;
    471	}
    472
    473	f = fq_classify(skb, q);
    474	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
    475		q->stat_flows_plimit++;
    476		return qdisc_drop(skb, sch, to_free);
    477	}
    478
    479	f->qlen++;
    480	qdisc_qstats_backlog_inc(sch, skb);
    481	if (fq_flow_is_detached(f)) {
    482		fq_flow_add_tail(&q->new_flows, f);
    483		if (time_after(jiffies, f->age + q->flow_refill_delay))
    484			f->credit = max_t(u32, f->credit, q->quantum);
    485		q->inactive_flows--;
    486	}
    487
    488	/* Note: this overwrites f->age */
    489	flow_queue_add(f, skb);
    490
    491	if (unlikely(f == &q->internal)) {
    492		q->stat_internal_packets++;
    493	}
    494	sch->q.qlen++;
    495
    496	return NET_XMIT_SUCCESS;
    497}
    498
    499static void fq_check_throttled(struct fq_sched_data *q, u64 now)
    500{
    501	unsigned long sample;
    502	struct rb_node *p;
    503
    504	if (q->time_next_delayed_flow > now)
    505		return;
    506
    507	/* Update unthrottle latency EWMA.
     508	 * This is cheap and can help diagnose timer/latency problems.
    509	 */
    510	sample = (unsigned long)(now - q->time_next_delayed_flow);
    511	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
    512	q->unthrottle_latency_ns += sample >> 3;
    513
    514	q->time_next_delayed_flow = ~0ULL;
    515	while ((p = rb_first(&q->delayed)) != NULL) {
    516		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
    517
    518		if (f->time_next_packet > now) {
    519			q->time_next_delayed_flow = f->time_next_packet;
    520			break;
    521		}
    522		fq_flow_unset_throttled(q, f);
    523	}
    524}
    525
    526static struct sk_buff *fq_dequeue(struct Qdisc *sch)
    527{
    528	struct fq_sched_data *q = qdisc_priv(sch);
    529	struct fq_flow_head *head;
    530	struct sk_buff *skb;
    531	struct fq_flow *f;
    532	unsigned long rate;
    533	u32 plen;
    534	u64 now;
    535
    536	if (!sch->q.qlen)
    537		return NULL;
    538
    539	skb = fq_peek(&q->internal);
    540	if (unlikely(skb)) {
    541		fq_dequeue_skb(sch, &q->internal, skb);
    542		goto out;
    543	}
    544
    545	q->ktime_cache = now = ktime_get_ns();
    546	fq_check_throttled(q, now);
    547begin:
    548	head = &q->new_flows;
    549	if (!head->first) {
    550		head = &q->old_flows;
    551		if (!head->first) {
    552			if (q->time_next_delayed_flow != ~0ULL)
    553				qdisc_watchdog_schedule_range_ns(&q->watchdog,
    554							q->time_next_delayed_flow,
    555							q->timer_slack);
    556			return NULL;
    557		}
    558	}
    559	f = head->first;
    560
    561	if (f->credit <= 0) {
    562		f->credit += q->quantum;
    563		head->first = f->next;
    564		fq_flow_add_tail(&q->old_flows, f);
    565		goto begin;
    566	}
    567
    568	skb = fq_peek(f);
    569	if (skb) {
    570		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
    571					     f->time_next_packet);
    572
    573		if (now < time_next_packet) {
    574			head->first = f->next;
    575			f->time_next_packet = time_next_packet;
    576			fq_flow_set_throttled(q, f);
    577			goto begin;
    578		}
    579		prefetch(&skb->end);
    580		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
    581			INET_ECN_set_ce(skb);
    582			q->stat_ce_mark++;
    583		}
    584		fq_dequeue_skb(sch, f, skb);
    585	} else {
    586		head->first = f->next;
    587		/* force a pass through old_flows to prevent starvation */
    588		if ((head == &q->new_flows) && q->old_flows.first) {
    589			fq_flow_add_tail(&q->old_flows, f);
    590		} else {
    591			fq_flow_set_detached(f);
    592			q->inactive_flows++;
    593		}
    594		goto begin;
    595	}
    596	plen = qdisc_pkt_len(skb);
    597	f->credit -= plen;
    598
    599	if (!q->rate_enable)
    600		goto out;
    601
    602	rate = q->flow_max_rate;
    603
    604	/* If EDT time was provided for this skb, we need to
    605	 * update f->time_next_packet only if this qdisc enforces
    606	 * a flow max rate.
    607	 */
    608	if (!skb->tstamp) {
    609		if (skb->sk)
    610			rate = min(skb->sk->sk_pacing_rate, rate);
    611
    612		if (rate <= q->low_rate_threshold) {
    613			f->credit = 0;
    614		} else {
    615			plen = max(plen, q->quantum);
    616			if (f->credit > 0)
    617				goto out;
    618		}
    619	}
    620	if (rate != ~0UL) {
    621		u64 len = (u64)plen * NSEC_PER_SEC;
    622
    623		if (likely(rate))
    624			len = div64_ul(len, rate);
    625		/* Since socket rate can change later,
    626		 * clamp the delay to 1 second.
     627	 * Really, providers of too big packets should be fixed!
    628		 */
    629		if (unlikely(len > NSEC_PER_SEC)) {
    630			len = NSEC_PER_SEC;
    631			q->stat_pkts_too_long++;
    632		}
     633		/* Account for scheduler/timer drift.
    634		 * f->time_next_packet was set when prior packet was sent,
    635		 * and current time (@now) can be too late by tens of us.
    636		 */
    637		if (f->time_next_packet)
    638			len -= min(len/2, now - f->time_next_packet);
    639		f->time_next_packet = now + len;
    640	}
    641out:
    642	qdisc_bstats_update(sch, skb);
    643	return skb;
    644}
    645
    646static void fq_flow_purge(struct fq_flow *flow)
    647{
    648	struct rb_node *p = rb_first(&flow->t_root);
    649
    650	while (p) {
    651		struct sk_buff *skb = rb_to_skb(p);
    652
    653		p = rb_next(p);
    654		rb_erase(&skb->rbnode, &flow->t_root);
    655		rtnl_kfree_skbs(skb, skb);
    656	}
    657	rtnl_kfree_skbs(flow->head, flow->tail);
    658	flow->head = NULL;
    659	flow->qlen = 0;
    660}
    661
    662static void fq_reset(struct Qdisc *sch)
    663{
    664	struct fq_sched_data *q = qdisc_priv(sch);
    665	struct rb_root *root;
    666	struct rb_node *p;
    667	struct fq_flow *f;
    668	unsigned int idx;
    669
    670	sch->q.qlen = 0;
    671	sch->qstats.backlog = 0;
    672
    673	fq_flow_purge(&q->internal);
    674
    675	if (!q->fq_root)
    676		return;
    677
    678	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
    679		root = &q->fq_root[idx];
    680		while ((p = rb_first(root)) != NULL) {
    681			f = rb_entry(p, struct fq_flow, fq_node);
    682			rb_erase(p, root);
    683
    684			fq_flow_purge(f);
    685
    686			kmem_cache_free(fq_flow_cachep, f);
    687		}
    688	}
    689	q->new_flows.first	= NULL;
    690	q->old_flows.first	= NULL;
    691	q->delayed		= RB_ROOT;
    692	q->flows		= 0;
    693	q->inactive_flows	= 0;
    694	q->throttled_flows	= 0;
    695}
    696
    697static void fq_rehash(struct fq_sched_data *q,
    698		      struct rb_root *old_array, u32 old_log,
    699		      struct rb_root *new_array, u32 new_log)
    700{
    701	struct rb_node *op, **np, *parent;
    702	struct rb_root *oroot, *nroot;
    703	struct fq_flow *of, *nf;
    704	int fcnt = 0;
    705	u32 idx;
    706
    707	for (idx = 0; idx < (1U << old_log); idx++) {
    708		oroot = &old_array[idx];
    709		while ((op = rb_first(oroot)) != NULL) {
    710			rb_erase(op, oroot);
    711			of = rb_entry(op, struct fq_flow, fq_node);
    712			if (fq_gc_candidate(of)) {
    713				fcnt++;
    714				kmem_cache_free(fq_flow_cachep, of);
    715				continue;
    716			}
    717			nroot = &new_array[hash_ptr(of->sk, new_log)];
    718
    719			np = &nroot->rb_node;
    720			parent = NULL;
    721			while (*np) {
    722				parent = *np;
    723
    724				nf = rb_entry(parent, struct fq_flow, fq_node);
    725				BUG_ON(nf->sk == of->sk);
    726
    727				if (nf->sk > of->sk)
    728					np = &parent->rb_right;
    729				else
    730					np = &parent->rb_left;
    731			}
    732
    733			rb_link_node(&of->fq_node, parent, np);
    734			rb_insert_color(&of->fq_node, nroot);
    735		}
    736	}
    737	q->flows -= fcnt;
    738	q->inactive_flows -= fcnt;
    739	q->stat_gc_flows += fcnt;
    740}
    741
    742static void fq_free(void *addr)
    743{
    744	kvfree(addr);
    745}
    746
    747static int fq_resize(struct Qdisc *sch, u32 log)
    748{
    749	struct fq_sched_data *q = qdisc_priv(sch);
    750	struct rb_root *array;
    751	void *old_fq_root;
    752	u32 idx;
    753
    754	if (q->fq_root && log == q->fq_trees_log)
    755		return 0;
    756
     757	/* If XPS was set up, we can allocate memory on the right NUMA node */
    758	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
    759			      netdev_queue_numa_node_read(sch->dev_queue));
    760	if (!array)
    761		return -ENOMEM;
    762
    763	for (idx = 0; idx < (1U << log); idx++)
    764		array[idx] = RB_ROOT;
    765
    766	sch_tree_lock(sch);
    767
    768	old_fq_root = q->fq_root;
    769	if (old_fq_root)
    770		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
    771
    772	q->fq_root = array;
    773	q->fq_trees_log = log;
    774
    775	sch_tree_unlock(sch);
    776
    777	fq_free(old_fq_root);
    778
    779	return 0;
    780}
    781
    782static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
    783	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },
    784
    785	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
    786	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
    787	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
    788	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
    789	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
    790	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
    791	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
    792	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
    793	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
    794	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
    795	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
    796	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
    797	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
    798	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
    799	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
    800};
    801
    802static int fq_change(struct Qdisc *sch, struct nlattr *opt,
    803		     struct netlink_ext_ack *extack)
    804{
    805	struct fq_sched_data *q = qdisc_priv(sch);
    806	struct nlattr *tb[TCA_FQ_MAX + 1];
    807	int err, drop_count = 0;
    808	unsigned drop_len = 0;
    809	u32 fq_log;
    810
    811	if (!opt)
    812		return -EINVAL;
    813
    814	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
    815					  NULL);
    816	if (err < 0)
    817		return err;
    818
    819	sch_tree_lock(sch);
    820
    821	fq_log = q->fq_trees_log;
    822
    823	if (tb[TCA_FQ_BUCKETS_LOG]) {
    824		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
    825
    826		if (nval >= 1 && nval <= ilog2(256*1024))
    827			fq_log = nval;
    828		else
    829			err = -EINVAL;
    830	}
    831	if (tb[TCA_FQ_PLIMIT])
    832		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
    833
    834	if (tb[TCA_FQ_FLOW_PLIMIT])
    835		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
    836
    837	if (tb[TCA_FQ_QUANTUM]) {
    838		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
    839
    840		if (quantum > 0 && quantum <= (1 << 20)) {
    841			q->quantum = quantum;
    842		} else {
    843			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
    844			err = -EINVAL;
    845		}
    846	}
    847
    848	if (tb[TCA_FQ_INITIAL_QUANTUM])
    849		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
    850
    851	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
    852		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
    853				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
    854
    855	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
    856		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
    857
    858		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
    859	}
    860	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
    861		q->low_rate_threshold =
    862			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
    863
    864	if (tb[TCA_FQ_RATE_ENABLE]) {
    865		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
    866
    867		if (enable <= 1)
    868			q->rate_enable = enable;
    869		else
    870			err = -EINVAL;
    871	}
    872
    873	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
    874		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
    875
    876		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
    877	}
    878
    879	if (tb[TCA_FQ_ORPHAN_MASK])
    880		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
    881
    882	if (tb[TCA_FQ_CE_THRESHOLD])
    883		q->ce_threshold = (u64)NSEC_PER_USEC *
    884				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
    885
    886	if (tb[TCA_FQ_TIMER_SLACK])
    887		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
    888
    889	if (tb[TCA_FQ_HORIZON])
    890		q->horizon = (u64)NSEC_PER_USEC *
    891				  nla_get_u32(tb[TCA_FQ_HORIZON]);
    892
    893	if (tb[TCA_FQ_HORIZON_DROP])
    894		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
    895
    896	if (!err) {
    897
    898		sch_tree_unlock(sch);
    899		err = fq_resize(sch, fq_log);
    900		sch_tree_lock(sch);
    901	}
    902	while (sch->q.qlen > sch->limit) {
    903		struct sk_buff *skb = fq_dequeue(sch);
    904
    905		if (!skb)
    906			break;
    907		drop_len += qdisc_pkt_len(skb);
    908		rtnl_kfree_skbs(skb, skb);
    909		drop_count++;
    910	}
    911	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
    912
    913	sch_tree_unlock(sch);
    914	return err;
    915}
    916
    917static void fq_destroy(struct Qdisc *sch)
    918{
    919	struct fq_sched_data *q = qdisc_priv(sch);
    920
    921	fq_reset(sch);
    922	fq_free(q->fq_root);
    923	qdisc_watchdog_cancel(&q->watchdog);
    924}
    925
    926static int fq_init(struct Qdisc *sch, struct nlattr *opt,
    927		   struct netlink_ext_ack *extack)
    928{
    929	struct fq_sched_data *q = qdisc_priv(sch);
    930	int err;
    931
    932	sch->limit		= 10000;
    933	q->flow_plimit		= 100;
    934	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
    935	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
    936	q->flow_refill_delay	= msecs_to_jiffies(40);
    937	q->flow_max_rate	= ~0UL;
    938	q->time_next_delayed_flow = ~0ULL;
    939	q->rate_enable		= 1;
    940	q->new_flows.first	= NULL;
    941	q->old_flows.first	= NULL;
    942	q->delayed		= RB_ROOT;
    943	q->fq_root		= NULL;
    944	q->fq_trees_log		= ilog2(1024);
    945	q->orphan_mask		= 1024 - 1;
    946	q->low_rate_threshold	= 550000 / 8;
    947
    948	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
    949
    950	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
    951	q->horizon_drop = 1; /* by default, drop packets beyond horizon */
    952
    953	/* Default ce_threshold of 4294 seconds */
    954	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
    955
    956	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
    957
    958	if (opt)
    959		err = fq_change(sch, opt, extack);
    960	else
    961		err = fq_resize(sch, q->fq_trees_log);
    962
    963	return err;
    964}
    965
    966static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
    967{
    968	struct fq_sched_data *q = qdisc_priv(sch);
    969	u64 ce_threshold = q->ce_threshold;
    970	u64 horizon = q->horizon;
    971	struct nlattr *opts;
    972
    973	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
    974	if (opts == NULL)
    975		goto nla_put_failure;
    976
    977	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
    978
    979	do_div(ce_threshold, NSEC_PER_USEC);
    980	do_div(horizon, NSEC_PER_USEC);
    981
    982	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
    983	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
    984	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
    985	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
    986	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
    987	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
    988			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
    989	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
    990			jiffies_to_usecs(q->flow_refill_delay)) ||
    991	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
    992	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
    993			q->low_rate_threshold) ||
    994	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
    995	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
    996	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
    997	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
    998	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
    999		goto nla_put_failure;
   1000
   1001	return nla_nest_end(skb, opts);
   1002
   1003nla_put_failure:
   1004	return -1;
   1005}
   1006
   1007static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
   1008{
   1009	struct fq_sched_data *q = qdisc_priv(sch);
   1010	struct tc_fq_qd_stats st;
   1011
   1012	sch_tree_lock(sch);
   1013
   1014	st.gc_flows		  = q->stat_gc_flows;
   1015	st.highprio_packets	  = q->stat_internal_packets;
   1016	st.tcp_retrans		  = 0;
   1017	st.throttled		  = q->stat_throttled;
   1018	st.flows_plimit		  = q->stat_flows_plimit;
   1019	st.pkts_too_long	  = q->stat_pkts_too_long;
   1020	st.allocation_errors	  = q->stat_allocation_errors;
   1021	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
   1022				    ktime_get_ns();
   1023	st.flows		  = q->flows;
   1024	st.inactive_flows	  = q->inactive_flows;
   1025	st.throttled_flows	  = q->throttled_flows;
   1026	st.unthrottle_latency_ns  = min_t(unsigned long,
   1027					  q->unthrottle_latency_ns, ~0U);
   1028	st.ce_mark		  = q->stat_ce_mark;
   1029	st.horizon_drops	  = q->stat_horizon_drops;
   1030	st.horizon_caps		  = q->stat_horizon_caps;
   1031	sch_tree_unlock(sch);
   1032
   1033	return gnet_stats_copy_app(d, &st, sizeof(st));
   1034}
   1035
   1036static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
   1037	.id		=	"fq",
   1038	.priv_size	=	sizeof(struct fq_sched_data),
   1039
   1040	.enqueue	=	fq_enqueue,
   1041	.dequeue	=	fq_dequeue,
   1042	.peek		=	qdisc_peek_dequeued,
   1043	.init		=	fq_init,
   1044	.reset		=	fq_reset,
   1045	.destroy	=	fq_destroy,
   1046	.change		=	fq_change,
   1047	.dump		=	fq_dump,
   1048	.dump_stats	=	fq_dump_stats,
   1049	.owner		=	THIS_MODULE,
   1050};
   1051
   1052static int __init fq_module_init(void)
   1053{
   1054	int ret;
   1055
   1056	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
   1057					   sizeof(struct fq_flow),
   1058					   0, 0, NULL);
   1059	if (!fq_flow_cachep)
   1060		return -ENOMEM;
   1061
   1062	ret = register_qdisc(&fq_qdisc_ops);
   1063	if (ret)
   1064		kmem_cache_destroy(fq_flow_cachep);
   1065	return ret;
   1066}
   1067
   1068static void __exit fq_module_exit(void)
   1069{
   1070	unregister_qdisc(&fq_qdisc_ops);
   1071	kmem_cache_destroy(fq_flow_cachep);
   1072}
   1073
   1074module_init(fq_module_init)
   1075module_exit(fq_module_exit)
   1076MODULE_AUTHOR("Eric Dumazet");
   1077MODULE_LICENSE("GPL");
   1078MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
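
The pacing step at the end of fq_dequeue() converts a packet length and a pacing
rate into an earliest departure time: len = plen * NSEC_PER_SEC / rate, clamped
to one second, then reduced by up to len/2 to absorb scheduler and timer drift.
Below is a minimal user-space sketch of that arithmetic only; it is not part of
sch_fq.c, and the names (next_send_time, rate_bytes_per_sec, ...) are
illustrative, not kernel identifiers.

/* Minimal sketch (not kernel code) of the pacing-delay arithmetic performed
 * at the end of fq_dequeue() above.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Compute the absolute time (ns) at which the next packet of a flow may be
 * sent, given the packet length, the pacing rate in bytes per second, the
 * current time, and the previously computed send time (0 if none).
 * Assumes now_ns >= prev_next_ns, which holds for a flow fq_dequeue() is
 * allowed to serve.
 */
static uint64_t next_send_time(uint32_t plen, uint64_t rate_bytes_per_sec,
			       uint64_t now_ns, uint64_t prev_next_ns)
{
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	if (rate_bytes_per_sec)
		len /= rate_bytes_per_sec;
	/* Clamp the delay to 1 second, as sch_fq does for oversized packets. */
	if (len > NSEC_PER_SEC)
		len = NSEC_PER_SEC;
	/* Give back up to half of the delay to account for timer drift. */
	if (prev_next_ns) {
		uint64_t drift = now_ns - prev_next_ns;

		len -= (len / 2 < drift) ? len / 2 : drift;
	}
	return now_ns + len;
}

int main(void)
{
	/* A 1500 byte packet paced at 12.5 MB/s (100 Mbit/s): 120 us apart. */
	printf("%llu ns\n",
	       (unsigned long long)next_send_time(1500, 12500000, 0, 0));
	return 0;
}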