cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sch_tbf.c (15554B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	(An illustrative sketch of this bookkeeping follows this comment.)

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10 Mbit ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500,
	P_crit = 150 Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, the limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc; if the inner qdisc is
	changed, the limit is no longer effective.
*/
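
/* Editor's illustration, not part of the original file: a minimal sketch of
 * the dual-bucket bookkeeping described above, in the same ns-scaled form
 * that tbf_dequeue() uses further down. The names tbf_example and
 * tbf_example_would_send are hypothetical; 'buffer' and 'mtu' hold the
 * depths B/R and M/P expressed as time, and r_ns/p_ns are the packet's
 * transmission times S/R and S/P.
 */
struct tbf_example {
	s64	buffer;		/* depth B/R, in ns */
	s64	mtu;		/* depth M/P, in ns */
	s64	tokens;		/* current R credit, in ns */
	s64	ptokens;	/* current P credit, in ns */
	s64	t_c;		/* last checkpoint, in ns */
};

static bool __maybe_unused tbf_example_would_send(struct tbf_example *m,
						  s64 now, s64 r_ns, s64 p_ns)
{
	/* N(t) grows with elapsed time but saturates at the bucket depth. */
	s64 toks = min_t(s64, now - m->t_c, m->buffer);
	s64 ptoks = min_t(s64, toks + m->ptokens, m->mtu) - p_ns;

	toks = min_t(s64, toks + m->tokens, m->buffer) - r_ns;
	if (toks < 0 || ptoks < 0)
		return false;	/* throttle until both buckets have credit */

	/* Sending "jumps" both buckets down by the packet's cost. */
	m->t_c = now;
	m->tokens = toks;
	m->ptokens = ptoks;
	return true;
}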

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length: convert time in ns to length in bytes
 * to determine how many bytes can be sent in a given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
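
/* Worked example (editor's addition, hypothetical values): with
 * rate_bytes_ps = 125000000 (1 Gbit/s), overhead = 0 and
 * time_in_ns = 1000000 (1 ms), the formula above gives
 *
 *	len = 1000000 * 125000000 / NSEC_PER_SEC = 125000 bytes,
 *
 * and on TC_LINKLAYER_ATM the result is additionally scaled by 48/53
 * (the payload fraction of a 53-byte ATM cell): 125000 / 53 * 48 = 113184.
 * psched_l2t_ns() performs the inverse (length-to-time) conversion used
 * throughout this file.
 */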

static void tbf_offload_change(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_REPLACE;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.replace_params.rate = q->rate;
	qopt.replace_params.max_size = q->max_size;
	qopt.replace_params.qstats = &sch->qstats;

	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_DESTROY;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
	struct tc_tbf_qopt_offload qopt;

	qopt.command = TC_TBF_STATS;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.stats.bstats = &sch->bstats;
	qopt.stats.qstats = &sch->qstats;

	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
			      struct Qdisc *old, struct netlink_ext_ack *extack)
{
	struct tc_tbf_qopt_offload graft_offload = {
		.handle		= sch->handle,
		.parent		= sch->parent,
		.child_handle	= new->handle,
		.command	= TC_TBF_GRAFT,
	};

	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
	}
	sch->q.qlen += nb;
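	/* One oversized GSO skb of prev_len bytes has just become nb
	 * segments totalling len bytes; report the difference upwards so
	 * ancestor qdiscs keep their qlen/backlog counters consistent.
	 */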
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

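		/* Both buckets have enough credit iff neither counter went
		 * negative; OR-ing the two s64 values tests both sign bits
		 * in a single comparison.
		 */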
		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which could be sent now. That sounds tempting, but it
		   is wrong in principle: we MUST NOT reorder packets
		   under these circumstances.

		   Really, splitting the flow into independent subflows
		   would be a very good solution; this is the main idea
		   of all FQ algorithms (cf. CSZ, HPFQ, HFSC).
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

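	/* qopt->buffer and qopt->mtu arrive in psched ticks; convert them
	 * to nanoseconds, clamped to ~0U ns (a little over 4 seconds).
	 */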
	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
					peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		qdisc_put(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;

	tbf_offload_change(sch);
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	tbf_offload_destroy(sch);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;
	int err;

	err = tbf_offload_dump(sch);
	if (err)
		return err;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);

	tbf_offload_graft(sch, new, *old, extack);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");