cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

blk-iocost.c (98934B)


      1/* SPDX-License-Identifier: GPL-2.0
      2 *
      3 * IO cost model based controller.
      4 *
      5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
      6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
      7 * Copyright (C) 2019 Facebook
      8 *
       9 * One challenge of controlling IO resources is the lack of a trivially
     10 * observable cost metric.  This is distinguished from CPU and memory where
     11 * wallclock time and the number of bytes can serve as accurate enough
     12 * approximations.
     13 *
     14 * Bandwidth and iops are the most commonly used metrics for IO devices but
     15 * depending on the type and specifics of the device, different IO patterns
     16 * easily lead to multiple orders of magnitude variations rendering them
     17 * useless for the purpose of IO capacity distribution.  While on-device
      18 * time, with a lot of crutches, could serve as a useful approximation for
     19 * non-queued rotational devices, this is no longer viable with modern
     20 * devices, even the rotational ones.
     21 *
     22 * While there is no cost metric we can trivially observe, it isn't a
     23 * complete mystery.  For example, on a rotational device, seek cost
     24 * dominates while a contiguous transfer contributes a smaller amount
     25 * proportional to the size.  If we can characterize at least the relative
     26 * costs of these different types of IOs, it should be possible to
     27 * implement a reasonable work-conserving proportional IO resource
     28 * distribution.
     29 *
     30 * 1. IO Cost Model
     31 *
     32 * IO cost model estimates the cost of an IO given its basic parameters and
     33 * history (e.g. the end sector of the last IO).  The cost is measured in
     34 * device time.  If a given IO is estimated to cost 10ms, the device should
     35 * be able to process ~100 of those IOs in a second.
     36 *
     37 * Currently, there's only one builtin cost model - linear.  Each IO is
     38 * classified as sequential or random and given a base cost accordingly.
     39 * On top of that, a size cost proportional to the length of the IO is
     40 * added.  While simple, this model captures the operational
      41 * characteristics of a wide variety of devices well enough.  Default
     42 * parameters for several different classes of devices are provided and the
     43 * parameters can be configured from userspace via
     44 * /sys/fs/cgroup/io.cost.model.
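         *
         * Each io.cost.model line is keyed by the MAJ:MIN device number and
         * takes key=value pairs named after the linear coefficients below
         * (rbps, rseqiops, rrandiops, wbps, wseqiops, wrandiops).  For
         * example, using the HDD defaults from the autop table further down
         * (the device number is illustrative and the line is wrapped here):
         *
         *   8:16 model=linear rbps=174019176 rseqiops=41708 rrandiops=370
         *        wbps=178075866 wseqiops=42705 wrandiops=378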
     45 *
     46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
     47 * device-specific coefficients.
     48 *
     49 * 2. Control Strategy
     50 *
     51 * The device virtual time (vtime) is used as the primary control metric.
     52 * The control strategy is composed of the following three parts.
     53 *
     54 * 2-1. Vtime Distribution
     55 *
     56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
     57 * calculated.  Please consider the following hierarchy where the numbers
     58 * inside parentheses denote the configured weights.
     59 *
     60 *           root
     61 *         /       \
     62 *      A (w:100)  B (w:300)
     63 *      /       \
     64 *  A0 (w:100)  A1 (w:100)
     65 *
     66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
     67 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
     68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
     69 * 12.5% each.  The distribution mechanism only cares about these flattened
     70 * shares.  They're called hweights (hierarchical weights) and always add
      71 * up to 1 (WEIGHT_ONE).
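         * In the fixed-point representation used throughout, a 12.5% share
         * corresponds to an hweight of WEIGHT_ONE / 8, i.e. 8192.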
     72 *
     73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
     74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
     75 * against the device vtime - an IO which takes 10ms on the underlying
     76 * device is considered to take 80ms on A0.
     77 *
     78 * This constitutes the basis of IO capacity distribution.  Each cgroup's
     79 * vtime is running at a rate determined by its hweight.  A cgroup tracks
     80 * the vtime consumed by past IOs and can issue a new IO if doing so
     81 * wouldn't outrun the current device vtime.  Otherwise, the IO is
     82 * suspended until the vtime has progressed enough to cover it.
     83 *
     84 * 2-2. Vrate Adjustment
     85 *
     86 * It's unrealistic to expect the cost model to be perfect.  There are too
     87 * many devices and even on the same device the overall performance
     88 * fluctuates depending on numerous factors such as IO mixture and device
     89 * internal garbage collection.  The controller needs to adapt dynamically.
     90 *
     91 * This is achieved by adjusting the overall IO rate according to how busy
     92 * the device is.  If the device becomes overloaded, we're sending down too
     93 * many IOs and should generally slow down.  If there are waiting issuers
     94 * but the device isn't saturated, we're issuing too few and should
     95 * generally speed up.
     96 *
     97 * To slow down, we lower the vrate - the rate at which the device vtime
     98 * passes compared to the wall clock.  For example, if the vtime is running
     99 * at the vrate of 75%, all cgroups added up would only be able to issue
    100 * 750ms worth of IOs per second, and vice-versa for speeding up.
    101 *
     102 * Device busyness is determined using two criteria - rq wait and
    103 * completion latencies.
    104 *
    105 * When a device gets saturated, the on-device and then the request queues
    106 * fill up and a bio which is ready to be issued has to wait for a request
    107 * to become available.  When this delay becomes noticeable, it's a clear
    108 * indication that the device is saturated and we lower the vrate.  This
    109 * saturation signal is fairly conservative as it only triggers when both
    110 * hardware and software queues are filled up, and is used as the default
    111 * busy signal.
    112 *
    113 * As devices can have deep queues and be unfair in how the queued commands
     114 * are executed, solely depending on rq wait may not result in satisfactory
    115 * control quality.  For a better control quality, completion latency QoS
    116 * parameters can be configured so that the device is considered saturated
    117 * if N'th percentile completion latency rises above the set point.
    118 *
    119 * The completion latency requirements are a function of both the
    120 * underlying device characteristics and the desired IO latency quality of
    121 * service.  There is an inherent trade-off - the tighter the latency QoS,
    122 * the higher the bandwidth lossage.  Latency QoS is disabled by default
    123 * and can be set through /sys/fs/cgroup/io.cost.qos.
    124 *
    125 * 2-3. Work Conservation
    126 *
    127 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
    128 * periodically while B is sending out enough parallel IOs to saturate the
    129 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
    130 * cost per second, i.e., 10% of the device capacity.  The naive
    131 * distribution of half and half would lead to 60% utilization of the
    132 * device, a significant reduction in the total amount of work done
    133 * compared to free-for-all competition.  This is too high a cost to pay
    134 * for IO control.
    135 *
    136 * To conserve the total amount of work done, we keep track of how much
    137 * each active cgroup is actually using and yield part of its weight if
    138 * there are other cgroups which can make use of it.  In the above case,
    139 * A's weight will be lowered so that it hovers above the actual usage and
    140 * B would be able to use the rest.
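     140 * With A capped near its ~10% usage plus some margin, B's share can grow
     140 * towards ~90% and overall utilization recovers to close to 100% instead
     140 * of 60%.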
    141 *
    142 * As we don't want to penalize a cgroup for donating its weight, the
    143 * surplus weight adjustment factors in a margin and has an immediate
    144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
    145 *
     146 * Note that adjusting down surplus weights has the same effect as
     147 * accelerating vtime for other cgroups, and work conservation can also be
     148 * implemented by adjusting vrate dynamically.  However, settling who can
     149 * donate and who should take back how much requires hweight propagation
     150 * anyway, which makes it easier to implement and understand as a separate
     151 * mechanism.
    152 *
    153 * 3. Monitoring
    154 *
    155 * Instead of debugfs or other clumsy monitoring mechanisms, this
    156 * controller uses a drgn based monitoring script -
    157 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
    158 * https://github.com/osandov/drgn.  The output looks like the following.
    159 *
    160 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
    161 *                 active      weight      hweight% inflt% dbt  delay usages%
    162 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
    163 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
    164 *
    165 * - per	: Timer period
    166 * - cur_per	: Internal wall and device vtime clock
    167 * - vrate	: Device virtual time rate against wall clock
    168 * - weight	: Surplus-adjusted and configured weights
    169 * - hweight	: Surplus-adjusted and configured hierarchical weights
    170 * - inflt	: The percentage of in-flight IO cost at the end of last period
    171 * - del_ms	: Deferred issuer delay induction level and duration
    172 * - usages	: Usage history
    173 */
    174
    175#include <linux/kernel.h>
    176#include <linux/module.h>
    177#include <linux/timer.h>
    178#include <linux/time64.h>
    179#include <linux/parser.h>
    180#include <linux/sched/signal.h>
    181#include <asm/local.h>
    182#include <asm/local64.h>
    183#include "blk-rq-qos.h"
    184#include "blk-stat.h"
    185#include "blk-wbt.h"
    186#include "blk-cgroup.h"
    187
    188#ifdef CONFIG_TRACEPOINTS
    189
    190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
    191#define TRACE_IOCG_PATH_LEN 1024
    192static DEFINE_SPINLOCK(trace_iocg_path_lock);
    193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
    194
    195#define TRACE_IOCG_PATH(type, iocg, ...)					\
    196	do {									\
    197		unsigned long flags;						\
    198		if (trace_iocost_##type##_enabled()) {				\
    199			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
    200			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
    201				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
    202			trace_iocost_##type(iocg, trace_iocg_path,		\
    203					      ##__VA_ARGS__);			\
    204			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
    205		}								\
    206	} while (0)
    207
     208#else	/* CONFIG_TRACEPOINTS */
    209#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
     210#endif	/* CONFIG_TRACEPOINTS */
    211
    212enum {
    213	MILLION			= 1000000,
    214
    215	/* timer period is calculated from latency requirements, bound it */
    216	MIN_PERIOD		= USEC_PER_MSEC,
    217	MAX_PERIOD		= USEC_PER_SEC,
    218
    219	/*
    220	 * iocg->vtime is targeted at 50% behind the device vtime, which
    221	 * serves as its IO credit buffer.  Surplus weight adjustment is
    222	 * immediately canceled if the vtime margin runs below 10%.
    223	 */
    224	MARGIN_MIN_PCT		= 10,
    225	MARGIN_LOW_PCT		= 20,
    226	MARGIN_TARGET_PCT	= 50,
    227
    228	INUSE_ADJ_STEP_PCT	= 25,
    229
    230	/* Have some play in timer operations */
    231	TIMER_SLACK_PCT		= 1,
    232
    233	/* 1/64k is granular enough and can easily be handled w/ u32 */
    234	WEIGHT_ONE		= 1 << 16,
    235
    236	/*
    237	 * As vtime is used to calculate the cost of each IO, it needs to
    238	 * be fairly high precision.  For example, it should be able to
    239	 * represent the cost of a single page worth of discard with
     240	 * sufficient accuracy.  At the same time, it should be able to
    241	 * represent reasonably long enough durations to be useful and
    242	 * convenient during operation.
    243	 *
    244	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
    245	 * granularity and days of wrap-around time even at extreme vrates.
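        	 * (2^37 / 10^9 is ~137 vtime units per nanosecond; a u64 vtime
        	 * counter wraps after 2^27 seconds - about 4.3 years at the
        	 * nominal rate and roughly two weeks at the 10000% vrate cap.)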
    246	 */
    247	VTIME_PER_SEC_SHIFT	= 37,
    248	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
    249	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
    250	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
    251
    252	/* bound vrate adjustments within two orders of magnitude */
    253	VRATE_MIN_PPM		= 10000,	/* 1% */
    254	VRATE_MAX_PPM		= 100000000,	/* 10000% */
    255
    256	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
    257	VRATE_CLAMP_ADJ_PCT	= 4,
    258
    259	/* if IOs end up waiting for requests, issue less */
    260	RQ_WAIT_BUSY_PCT	= 5,
    261
    262	/* unbusy hysterisis */
    263	UNBUSY_THR_PCT		= 75,
    264
    265	/*
    266	 * The effect of delay is indirect and non-linear and a huge amount of
    267	 * future debt can accumulate abruptly while unthrottled. Linearly scale
    268	 * up delay as debt is going up and then let it decay exponentially.
    269	 * This gives us quick ramp ups while delay is accumulating and long
     270	 * tails which help reduce the frequency of debt explosions on
    271	 * unthrottle. The parameters are experimentally determined.
    272	 *
    273	 * The delay mechanism provides adequate protection and behavior in many
     274	 * cases. However, this is far from ideal and falls short on both
    275	 * fronts. The debtors are often throttled too harshly costing a
    276	 * significant level of fairness and possibly total work while the
    277	 * protection against their impacts on the system can be choppy and
    278	 * unreliable.
    279	 *
    280	 * The shortcoming primarily stems from the fact that, unlike for page
     281	 * cache, the kernel doesn't have a well-defined back-pressure propagation
    282	 * mechanism and policies for anonymous memory. Fully addressing this
    283	 * issue will likely require substantial improvements in the area.
    284	 */
    285	MIN_DELAY_THR_PCT	= 500,
    286	MAX_DELAY_THR_PCT	= 25000,
    287	MIN_DELAY		= 250,
    288	MAX_DELAY		= 250 * USEC_PER_MSEC,
    289
    290	/* halve debts if avg usage over 100ms is under 50% */
    291	DFGV_USAGE_PCT		= 50,
    292	DFGV_PERIOD		= 100 * USEC_PER_MSEC,
    293
    294	/* don't let cmds which take a very long time pin lagging for too long */
    295	MAX_LAGGING_PERIODS	= 10,
    296
    297	/* switch iff the conditions are met for longer than this */
    298	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
    299
    300	/*
     301	 * Count IO size in 4k pages.  The 12-bit shift helps keep the
     302	 * size-proportional components of the cost calculation within a
     303	 * similar number of digits as the per-IO cost components.
    304	 */
    305	IOC_PAGE_SHIFT		= 12,
    306	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
    307	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
    308
    309	/* if apart further than 16M, consider randio for linear model */
    310	LCOEF_RANDIO_PAGES	= 4096,
    311};
    312
    313enum ioc_running {
    314	IOC_IDLE,
    315	IOC_RUNNING,
    316	IOC_STOP,
    317};
    318
    319/* io.cost.qos controls including per-dev enable of the whole controller */
    320enum {
    321	QOS_ENABLE,
    322	QOS_CTRL,
    323	NR_QOS_CTRL_PARAMS,
    324};
    325
    326/* io.cost.qos params */
    327enum {
    328	QOS_RPPM,
    329	QOS_RLAT,
    330	QOS_WPPM,
    331	QOS_WLAT,
    332	QOS_MIN,
    333	QOS_MAX,
    334	NR_QOS_PARAMS,
    335};
    336
    337/* io.cost.model controls */
    338enum {
    339	COST_CTRL,
    340	COST_MODEL,
    341	NR_COST_CTRL_PARAMS,
    342};
    343
    344/* builtin linear cost model coefficients */
    345enum {
    346	I_LCOEF_RBPS,
    347	I_LCOEF_RSEQIOPS,
    348	I_LCOEF_RRANDIOPS,
    349	I_LCOEF_WBPS,
    350	I_LCOEF_WSEQIOPS,
    351	I_LCOEF_WRANDIOPS,
    352	NR_I_LCOEFS,
    353};
    354
    355enum {
    356	LCOEF_RPAGE,
    357	LCOEF_RSEQIO,
    358	LCOEF_RRANDIO,
    359	LCOEF_WPAGE,
    360	LCOEF_WSEQIO,
    361	LCOEF_WRANDIO,
    362	NR_LCOEFS,
    363};
    364
    365enum {
    366	AUTOP_INVALID,
    367	AUTOP_HDD,
    368	AUTOP_SSD_QD1,
    369	AUTOP_SSD_DFL,
    370	AUTOP_SSD_FAST,
    371};
    372
    373struct ioc_params {
    374	u32				qos[NR_QOS_PARAMS];
    375	u64				i_lcoefs[NR_I_LCOEFS];
    376	u64				lcoefs[NR_LCOEFS];
    377	u32				too_fast_vrate_pct;
    378	u32				too_slow_vrate_pct;
    379};
    380
    381struct ioc_margins {
    382	s64				min;
    383	s64				low;
    384	s64				target;
    385};
    386
    387struct ioc_missed {
    388	local_t				nr_met;
    389	local_t				nr_missed;
    390	u32				last_met;
    391	u32				last_missed;
    392};
    393
    394struct ioc_pcpu_stat {
    395	struct ioc_missed		missed[2];
    396
    397	local64_t			rq_wait_ns;
    398	u64				last_rq_wait_ns;
    399};
    400
    401/* per device */
    402struct ioc {
    403	struct rq_qos			rqos;
    404
    405	bool				enabled;
    406
    407	struct ioc_params		params;
    408	struct ioc_margins		margins;
    409	u32				period_us;
    410	u32				timer_slack_ns;
    411	u64				vrate_min;
    412	u64				vrate_max;
    413
    414	spinlock_t			lock;
    415	struct timer_list		timer;
    416	struct list_head		active_iocgs;	/* active cgroups */
    417	struct ioc_pcpu_stat __percpu	*pcpu_stat;
    418
    419	enum ioc_running		running;
    420	atomic64_t			vtime_rate;
    421	u64				vtime_base_rate;
    422	s64				vtime_err;
    423
    424	seqcount_spinlock_t		period_seqcount;
    425	u64				period_at;	/* wallclock starttime */
    426	u64				period_at_vtime; /* vtime starttime */
    427
    428	atomic64_t			cur_period;	/* inc'd each period */
    429	int				busy_level;	/* saturation history */
    430
    431	bool				weights_updated;
    432	atomic_t			hweight_gen;	/* for lazy hweights */
    433
     434	/* debt forgiveness */
    435	u64				dfgv_period_at;
    436	u64				dfgv_period_rem;
    437	u64				dfgv_usage_us_sum;
    438
    439	u64				autop_too_fast_at;
    440	u64				autop_too_slow_at;
    441	int				autop_idx;
    442	bool				user_qos_params:1;
    443	bool				user_cost_model:1;
    444};
    445
    446struct iocg_pcpu_stat {
    447	local64_t			abs_vusage;
    448};
    449
    450struct iocg_stat {
    451	u64				usage_us;
    452	u64				wait_us;
    453	u64				indebt_us;
    454	u64				indelay_us;
    455};
    456
    457/* per device-cgroup pair */
    458struct ioc_gq {
    459	struct blkg_policy_data		pd;
    460	struct ioc			*ioc;
    461
    462	/*
     463	 * An iocg can get its weight from two sources - an explicit
    464	 * per-device-cgroup configuration or the default weight of the
    465	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
     466	 * configuration.  `weight` is the effective weight considering both
    467	 * sources.
    468	 *
    469	 * When an idle cgroup becomes active its `active` goes from 0 to
    470	 * `weight`.  `inuse` is the surplus adjusted active weight.
    471	 * `active` and `inuse` are used to calculate `hweight_active` and
    472	 * `hweight_inuse`.
    473	 *
    474	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
    475	 * surplus adjustments.
    476	 *
     477	 * `inuse` may be adjusted dynamically during the period. `saved_*` are used
    478	 * to determine and track adjustments.
    479	 */
    480	u32				cfg_weight;
    481	u32				weight;
    482	u32				active;
    483	u32				inuse;
    484
    485	u32				last_inuse;
    486	s64				saved_margin;
    487
    488	sector_t			cursor;		/* to detect randio */
    489
    490	/*
    491	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
    492	 * issued.  If lagging behind device vtime, the delta represents
    493	 * the currently available IO budget.  If running ahead, the
    494	 * overage.
    495	 *
     496	 * `done_vtime` is the same but progressed on completion rather
    497	 * than issue.  The delta behind `vtime` represents the cost of
    498	 * currently in-flight IOs.
    499	 */
    500	atomic64_t			vtime;
    501	atomic64_t			done_vtime;
    502	u64				abs_vdebt;
    503
    504	/* current delay in effect and when it started */
    505	u64				delay;
    506	u64				delay_at;
    507
    508	/*
    509	 * The period this iocg was last active in.  Used for deactivation
    510	 * and invalidating `vtime`.
    511	 */
    512	atomic64_t			active_period;
    513	struct list_head		active_list;
    514
    515	/* see __propagate_weights() and current_hweight() for details */
    516	u64				child_active_sum;
    517	u64				child_inuse_sum;
    518	u64				child_adjusted_sum;
    519	int				hweight_gen;
    520	u32				hweight_active;
    521	u32				hweight_inuse;
    522	u32				hweight_donating;
    523	u32				hweight_after_donation;
    524
    525	struct list_head		walk_list;
    526	struct list_head		surplus_list;
    527
    528	struct wait_queue_head		waitq;
    529	struct hrtimer			waitq_timer;
    530
    531	/* timestamp at the latest activation */
    532	u64				activated_at;
    533
    534	/* statistics */
    535	struct iocg_pcpu_stat __percpu	*pcpu_stat;
    536	struct iocg_stat		stat;
    537	struct iocg_stat		last_stat;
    538	u64				last_stat_abs_vusage;
    539	u64				usage_delta_us;
    540	u64				wait_since;
    541	u64				indebt_since;
    542	u64				indelay_since;
    543
    544	/* this iocg's depth in the hierarchy and ancestors including self */
    545	int				level;
    546	struct ioc_gq			*ancestors[];
    547};
    548
    549/* per cgroup */
    550struct ioc_cgrp {
    551	struct blkcg_policy_data	cpd;
    552	unsigned int			dfl_weight;
    553};
    554
    555struct ioc_now {
    556	u64				now_ns;
    557	u64				now;
    558	u64				vnow;
    559	u64				vrate;
    560};
    561
    562struct iocg_wait {
    563	struct wait_queue_entry		wait;
    564	struct bio			*bio;
    565	u64				abs_cost;
    566	bool				committed;
    567};
    568
    569struct iocg_wake_ctx {
    570	struct ioc_gq			*iocg;
    571	u32				hw_inuse;
    572	s64				vbudget;
    573};
    574
    575static const struct ioc_params autop[] = {
    576	[AUTOP_HDD] = {
    577		.qos				= {
    578			[QOS_RLAT]		=        250000, /* 250ms */
    579			[QOS_WLAT]		=        250000,
    580			[QOS_MIN]		= VRATE_MIN_PPM,
    581			[QOS_MAX]		= VRATE_MAX_PPM,
    582		},
    583		.i_lcoefs			= {
    584			[I_LCOEF_RBPS]		=     174019176,
    585			[I_LCOEF_RSEQIOPS]	=         41708,
    586			[I_LCOEF_RRANDIOPS]	=           370,
    587			[I_LCOEF_WBPS]		=     178075866,
    588			[I_LCOEF_WSEQIOPS]	=         42705,
    589			[I_LCOEF_WRANDIOPS]	=           378,
    590		},
    591	},
    592	[AUTOP_SSD_QD1] = {
    593		.qos				= {
    594			[QOS_RLAT]		=         25000, /* 25ms */
    595			[QOS_WLAT]		=         25000,
    596			[QOS_MIN]		= VRATE_MIN_PPM,
    597			[QOS_MAX]		= VRATE_MAX_PPM,
    598		},
    599		.i_lcoefs			= {
    600			[I_LCOEF_RBPS]		=     245855193,
    601			[I_LCOEF_RSEQIOPS]	=         61575,
    602			[I_LCOEF_RRANDIOPS]	=          6946,
    603			[I_LCOEF_WBPS]		=     141365009,
    604			[I_LCOEF_WSEQIOPS]	=         33716,
    605			[I_LCOEF_WRANDIOPS]	=         26796,
    606		},
    607	},
    608	[AUTOP_SSD_DFL] = {
    609		.qos				= {
    610			[QOS_RLAT]		=         25000, /* 25ms */
    611			[QOS_WLAT]		=         25000,
    612			[QOS_MIN]		= VRATE_MIN_PPM,
    613			[QOS_MAX]		= VRATE_MAX_PPM,
    614		},
    615		.i_lcoefs			= {
    616			[I_LCOEF_RBPS]		=     488636629,
    617			[I_LCOEF_RSEQIOPS]	=          8932,
    618			[I_LCOEF_RRANDIOPS]	=          8518,
    619			[I_LCOEF_WBPS]		=     427891549,
    620			[I_LCOEF_WSEQIOPS]	=         28755,
    621			[I_LCOEF_WRANDIOPS]	=         21940,
    622		},
    623		.too_fast_vrate_pct		=           500,
    624	},
    625	[AUTOP_SSD_FAST] = {
    626		.qos				= {
    627			[QOS_RLAT]		=          5000, /* 5ms */
    628			[QOS_WLAT]		=          5000,
    629			[QOS_MIN]		= VRATE_MIN_PPM,
    630			[QOS_MAX]		= VRATE_MAX_PPM,
    631		},
    632		.i_lcoefs			= {
    633			[I_LCOEF_RBPS]		=    3102524156LLU,
    634			[I_LCOEF_RSEQIOPS]	=        724816,
    635			[I_LCOEF_RRANDIOPS]	=        778122,
    636			[I_LCOEF_WBPS]		=    1742780862LLU,
    637			[I_LCOEF_WSEQIOPS]	=        425702,
     638			[I_LCOEF_WRANDIOPS]	=        443193,
    639		},
    640		.too_slow_vrate_pct		=            10,
    641	},
    642};
    643
    644/*
    645 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
    646 * vtime credit shortage and down on device saturation.
    647 */
    648static u32 vrate_adj_pct[] =
    649	{ 0, 0, 0, 0,
    650	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    651	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    652	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
    653
    654static struct blkcg_policy blkcg_policy_iocost;
    655
    656/* accessors and helpers */
    657static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
    658{
    659	return container_of(rqos, struct ioc, rqos);
    660}
    661
    662static struct ioc *q_to_ioc(struct request_queue *q)
    663{
    664	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
    665}
    666
    667static const char *q_name(struct request_queue *q)
    668{
    669	if (blk_queue_registered(q))
    670		return kobject_name(q->kobj.parent);
    671	else
    672		return "<unknown>";
    673}
    674
    675static const char __maybe_unused *ioc_name(struct ioc *ioc)
    676{
    677	return q_name(ioc->rqos.q);
    678}
    679
    680static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
    681{
    682	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
    683}
    684
    685static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
    686{
    687	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
    688}
    689
    690static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
    691{
    692	return pd_to_blkg(&iocg->pd);
    693}
    694
    695static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
    696{
    697	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
    698			    struct ioc_cgrp, cpd);
    699}
    700
    701/*
    702 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
    703 * weight, the more expensive each IO.  Must round up.
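         *
         * For example, at an hw_inuse of WEIGHT_ONE / 4 (25%), an absolute
         * cost of 100 maps to a vtime cost of 400.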
    704 */
    705static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
    706{
    707	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
    708}
    709
    710/*
    711 * The inverse of abs_cost_to_cost().  Must round up.
    712 */
    713static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
    714{
    715	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
    716}
    717
    718static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
    719			    u64 abs_cost, u64 cost)
    720{
    721	struct iocg_pcpu_stat *gcs;
    722
    723	bio->bi_iocost_cost = cost;
    724	atomic64_add(cost, &iocg->vtime);
    725
    726	gcs = get_cpu_ptr(iocg->pcpu_stat);
    727	local64_add(abs_cost, &gcs->abs_vusage);
    728	put_cpu_ptr(gcs);
    729}
    730
    731static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
    732{
    733	if (lock_ioc) {
    734		spin_lock_irqsave(&iocg->ioc->lock, *flags);
    735		spin_lock(&iocg->waitq.lock);
    736	} else {
    737		spin_lock_irqsave(&iocg->waitq.lock, *flags);
    738	}
    739}
    740
    741static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
    742{
    743	if (unlock_ioc) {
    744		spin_unlock(&iocg->waitq.lock);
    745		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
    746	} else {
    747		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
    748	}
    749}
    750
    751#define CREATE_TRACE_POINTS
    752#include <trace/events/iocost.h>
    753
    754static void ioc_refresh_margins(struct ioc *ioc)
    755{
    756	struct ioc_margins *margins = &ioc->margins;
    757	u32 period_us = ioc->period_us;
    758	u64 vrate = ioc->vtime_base_rate;
    759
    760	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
    761	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
    762	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
    763}
    764
     765/* latency QoS params changed, update period_us and all the dependent params */
    766static void ioc_refresh_period_us(struct ioc *ioc)
    767{
    768	u32 ppm, lat, multi, period_us;
    769
    770	lockdep_assert_held(&ioc->lock);
    771
    772	/* pick the higher latency target */
    773	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
    774		ppm = ioc->params.qos[QOS_RPPM];
    775		lat = ioc->params.qos[QOS_RLAT];
    776	} else {
    777		ppm = ioc->params.qos[QOS_WPPM];
    778		lat = ioc->params.qos[QOS_WLAT];
    779	}
    780
    781	/*
    782	 * We want the period to be long enough to contain a healthy number
    783	 * of IOs while short enough for granular control.  Define it as a
    784	 * multiple of the latency target.  Ideally, the multiplier should
    785	 * be scaled according to the percentile so that it would nominally
    786	 * contain a certain number of requests.  Let's be simpler and
    787	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
    788	 */
    789	if (ppm)
    790		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
    791	else
    792		multi = 2;
    793	period_us = multi * lat;
    794	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
    795
    796	/* calculate dependent params */
    797	ioc->period_us = period_us;
    798	ioc->timer_slack_ns = div64_u64(
    799		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
    800		100);
    801	ioc_refresh_margins(ioc);
    802}
    803
    804static int ioc_autop_idx(struct ioc *ioc)
    805{
    806	int idx = ioc->autop_idx;
    807	const struct ioc_params *p = &autop[idx];
    808	u32 vrate_pct;
    809	u64 now_ns;
    810
    811	/* rotational? */
    812	if (!blk_queue_nonrot(ioc->rqos.q))
    813		return AUTOP_HDD;
    814
    815	/* handle SATA SSDs w/ broken NCQ */
    816	if (blk_queue_depth(ioc->rqos.q) == 1)
    817		return AUTOP_SSD_QD1;
    818
    819	/* use one of the normal ssd sets */
    820	if (idx < AUTOP_SSD_DFL)
    821		return AUTOP_SSD_DFL;
    822
    823	/* if user is overriding anything, maintain what was there */
    824	if (ioc->user_qos_params || ioc->user_cost_model)
    825		return idx;
    826
    827	/* step up/down based on the vrate */
    828	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
    829	now_ns = ktime_get_ns();
    830
    831	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
    832		if (!ioc->autop_too_fast_at)
    833			ioc->autop_too_fast_at = now_ns;
    834		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
    835			return idx + 1;
    836	} else {
    837		ioc->autop_too_fast_at = 0;
    838	}
    839
    840	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
    841		if (!ioc->autop_too_slow_at)
    842			ioc->autop_too_slow_at = now_ns;
    843		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
    844			return idx - 1;
    845	} else {
    846		ioc->autop_too_slow_at = 0;
    847	}
    848
    849	return idx;
    850}
    851
    852/*
     853 * Take the following as input
    854 *
    855 *  @bps	maximum sequential throughput
    856 *  @seqiops	maximum sequential 4k iops
    857 *  @randiops	maximum random 4k iops
    858 *
    859 * and calculate the linear model cost coefficients.
    860 *
    861 *  *@page	per-page cost		1s / (@bps / 4096)
    862 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
     863 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
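         *
         * For example (illustrative numbers), @bps = 500M and @seqiops = 10000
         * give *@page of ~8.2us and *@seqio of ~91.8us worth of device time, so
         * a 64k (16 page) sequential IO costs about 91.8 + 16 * 8.2 ~= 223us
         * under the base-plus-size model described at the top of this file.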
    864 */
    865static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
    866			u64 *page, u64 *seqio, u64 *randio)
    867{
    868	u64 v;
    869
    870	*page = *seqio = *randio = 0;
    871
    872	if (bps)
    873		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
    874					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
    875
    876	if (seqiops) {
    877		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
    878		if (v > *page)
    879			*seqio = v - *page;
    880	}
    881
    882	if (randiops) {
    883		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
    884		if (v > *page)
    885			*randio = v - *page;
    886	}
    887}
    888
    889static void ioc_refresh_lcoefs(struct ioc *ioc)
    890{
    891	u64 *u = ioc->params.i_lcoefs;
    892	u64 *c = ioc->params.lcoefs;
    893
    894	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
    895		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
    896	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
    897		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
    898}
    899
    900static bool ioc_refresh_params(struct ioc *ioc, bool force)
    901{
    902	const struct ioc_params *p;
    903	int idx;
    904
    905	lockdep_assert_held(&ioc->lock);
    906
    907	idx = ioc_autop_idx(ioc);
    908	p = &autop[idx];
    909
    910	if (idx == ioc->autop_idx && !force)
    911		return false;
    912
    913	if (idx != ioc->autop_idx)
    914		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
    915
    916	ioc->autop_idx = idx;
    917	ioc->autop_too_fast_at = 0;
    918	ioc->autop_too_slow_at = 0;
    919
    920	if (!ioc->user_qos_params)
    921		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
    922	if (!ioc->user_cost_model)
    923		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
    924
    925	ioc_refresh_period_us(ioc);
    926	ioc_refresh_lcoefs(ioc);
    927
    928	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
    929					    VTIME_PER_USEC, MILLION);
    930	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
    931				   VTIME_PER_USEC, MILLION);
    932
    933	return true;
    934}
    935
    936/*
    937 * When an iocg accumulates too much vtime or gets deactivated, we throw away
    938 * some vtime, which lowers the overall device utilization. As the exact amount
    939 * which is being thrown away is known, we can compensate by accelerating the
    940 * vrate accordingly so that the extra vtime generated in the current period
    941 * matches what got lost.
    942 */
    943static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
    944{
    945	s64 pleft = ioc->period_at + ioc->period_us - now->now;
    946	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
    947	s64 vcomp, vcomp_min, vcomp_max;
    948
    949	lockdep_assert_held(&ioc->lock);
    950
    951	/* we need some time left in this period */
    952	if (pleft <= 0)
    953		goto done;
    954
    955	/*
    956	 * Calculate how much vrate should be adjusted to offset the error.
    957	 * Limit the amount of adjustment and deduct the adjusted amount from
    958	 * the error.
    959	 */
    960	vcomp = -div64_s64(ioc->vtime_err, pleft);
    961	vcomp_min = -(ioc->vtime_base_rate >> 1);
    962	vcomp_max = ioc->vtime_base_rate;
    963	vcomp = clamp(vcomp, vcomp_min, vcomp_max);
    964
    965	ioc->vtime_err += vcomp * pleft;
    966
    967	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
    968done:
    969	/* bound how much error can accumulate */
    970	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
    971}
    972
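        /*
         * Adjust ioc->vtime_base_rate according to the saturation state
         * tracked in ioc->busy_level.  An out-of-bounds vrate is pulled back
         * towards [vrate_min, vrate_max] gradually; otherwise the rate is
         * nudged up or down by a busy_level-indexed percentage from
         * vrate_adj_pct[].
         */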
    973static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
    974				  int nr_lagging, int nr_shortages,
    975				  int prev_busy_level, u32 *missed_ppm)
    976{
    977	u64 vrate = ioc->vtime_base_rate;
    978	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
    979
    980	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
    981		if (ioc->busy_level != prev_busy_level || nr_lagging)
    982			trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
    983						   missed_ppm, rq_wait_pct,
    984						   nr_lagging, nr_shortages);
    985
    986		return;
    987	}
    988
    989	/*
    990	 * If vrate is out of bounds, apply clamp gradually as the
    991	 * bounds can change abruptly.  Otherwise, apply busy_level
    992	 * based adjustment.
    993	 */
    994	if (vrate < vrate_min) {
    995		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
    996		vrate = min(vrate, vrate_min);
    997	} else if (vrate > vrate_max) {
    998		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
    999		vrate = max(vrate, vrate_max);
   1000	} else {
   1001		int idx = min_t(int, abs(ioc->busy_level),
   1002				ARRAY_SIZE(vrate_adj_pct) - 1);
   1003		u32 adj_pct = vrate_adj_pct[idx];
   1004
   1005		if (ioc->busy_level > 0)
   1006			adj_pct = 100 - adj_pct;
   1007		else
   1008			adj_pct = 100 + adj_pct;
   1009
   1010		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
   1011			      vrate_min, vrate_max);
   1012	}
   1013
   1014	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
   1015				   nr_lagging, nr_shortages);
   1016
   1017	ioc->vtime_base_rate = vrate;
   1018	ioc_refresh_margins(ioc);
   1019}
   1020
   1021/* take a snapshot of the current [v]time and vrate */
   1022static void ioc_now(struct ioc *ioc, struct ioc_now *now)
   1023{
   1024	unsigned seq;
   1025
   1026	now->now_ns = ktime_get();
   1027	now->now = ktime_to_us(now->now_ns);
   1028	now->vrate = atomic64_read(&ioc->vtime_rate);
   1029
   1030	/*
   1031	 * The current vtime is
   1032	 *
   1033	 *   vtime at period start + (wallclock time since the start) * vrate
   1034	 *
   1035	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
   1036	 * needed, they're seqcount protected.
   1037	 */
   1038	do {
   1039		seq = read_seqcount_begin(&ioc->period_seqcount);
   1040		now->vnow = ioc->period_at_vtime +
   1041			(now->now - ioc->period_at) * now->vrate;
   1042	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
   1043}
   1044
   1045static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
   1046{
   1047	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
   1048
   1049	write_seqcount_begin(&ioc->period_seqcount);
   1050	ioc->period_at = now->now;
   1051	ioc->period_at_vtime = now->vnow;
   1052	write_seqcount_end(&ioc->period_seqcount);
   1053
   1054	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
   1055	add_timer(&ioc->timer);
   1056}
   1057
   1058/*
   1059 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
   1060 * weight sums and propagate upwards accordingly. If @save, the current margin
   1061 * is saved to be used as reference for later inuse in-period adjustments.
   1062 */
   1063static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
   1064				bool save, struct ioc_now *now)
   1065{
   1066	struct ioc *ioc = iocg->ioc;
   1067	int lvl;
   1068
   1069	lockdep_assert_held(&ioc->lock);
   1070
   1071	/*
   1072	 * For an active leaf node, its inuse shouldn't be zero or exceed
   1073	 * @active. An active internal node's inuse is solely determined by the
   1074	 * inuse to active ratio of its children regardless of @inuse.
   1075	 */
   1076	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
   1077		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
   1078					   iocg->child_active_sum);
   1079	} else {
   1080		inuse = clamp_t(u32, inuse, 1, active);
   1081	}
   1082
   1083	iocg->last_inuse = iocg->inuse;
   1084	if (save)
   1085		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
   1086
   1087	if (active == iocg->active && inuse == iocg->inuse)
   1088		return;
   1089
   1090	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
   1091		struct ioc_gq *parent = iocg->ancestors[lvl];
   1092		struct ioc_gq *child = iocg->ancestors[lvl + 1];
   1093		u32 parent_active = 0, parent_inuse = 0;
   1094
   1095		/* update the level sums */
   1096		parent->child_active_sum += (s32)(active - child->active);
   1097		parent->child_inuse_sum += (s32)(inuse - child->inuse);
   1098		/* apply the updates */
   1099		child->active = active;
   1100		child->inuse = inuse;
   1101
   1102		/*
    1103		 * The delta between the inuse and active sums indicates how
    1104		 * much weight is being given away.  The parent's inuse and
    1105		 * active should reflect the ratio.
   1106		 */
   1107		if (parent->child_active_sum) {
   1108			parent_active = parent->weight;
   1109			parent_inuse = DIV64_U64_ROUND_UP(
   1110				parent_active * parent->child_inuse_sum,
   1111				parent->child_active_sum);
   1112		}
   1113
   1114		/* do we need to keep walking up? */
   1115		if (parent_active == parent->active &&
   1116		    parent_inuse == parent->inuse)
   1117			break;
   1118
   1119		active = parent_active;
   1120		inuse = parent_inuse;
   1121	}
   1122
   1123	ioc->weights_updated = true;
   1124}
   1125
   1126static void commit_weights(struct ioc *ioc)
   1127{
   1128	lockdep_assert_held(&ioc->lock);
   1129
   1130	if (ioc->weights_updated) {
   1131		/* paired with rmb in current_hweight(), see there */
   1132		smp_wmb();
   1133		atomic_inc(&ioc->hweight_gen);
   1134		ioc->weights_updated = false;
   1135	}
   1136}
   1137
   1138static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
   1139			      bool save, struct ioc_now *now)
   1140{
   1141	__propagate_weights(iocg, active, inuse, save, now);
   1142	commit_weights(iocg->ioc);
   1143}
   1144
   1145static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
   1146{
   1147	struct ioc *ioc = iocg->ioc;
   1148	int lvl;
   1149	u32 hwa, hwi;
   1150	int ioc_gen;
   1151
   1152	/* hot path - if uptodate, use cached */
   1153	ioc_gen = atomic_read(&ioc->hweight_gen);
   1154	if (ioc_gen == iocg->hweight_gen)
   1155		goto out;
   1156
   1157	/*
   1158	 * Paired with wmb in commit_weights(). If we saw the updated
   1159	 * hweight_gen, all the weight updates from __propagate_weights() are
   1160	 * visible too.
   1161	 *
   1162	 * We can race with weight updates during calculation and get it
   1163	 * wrong.  However, hweight_gen would have changed and a future
   1164	 * reader will recalculate and we're guaranteed to discard the
   1165	 * wrong result soon.
   1166	 */
   1167	smp_rmb();
   1168
   1169	hwa = hwi = WEIGHT_ONE;
   1170	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
   1171		struct ioc_gq *parent = iocg->ancestors[lvl];
   1172		struct ioc_gq *child = iocg->ancestors[lvl + 1];
   1173		u64 active_sum = READ_ONCE(parent->child_active_sum);
   1174		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
   1175		u32 active = READ_ONCE(child->active);
   1176		u32 inuse = READ_ONCE(child->inuse);
   1177
   1178		/* we can race with deactivations and either may read as zero */
   1179		if (!active_sum || !inuse_sum)
   1180			continue;
   1181
   1182		active_sum = max_t(u64, active, active_sum);
   1183		hwa = div64_u64((u64)hwa * active, active_sum);
   1184
   1185		inuse_sum = max_t(u64, inuse, inuse_sum);
   1186		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
   1187	}
   1188
   1189	iocg->hweight_active = max_t(u32, hwa, 1);
   1190	iocg->hweight_inuse = max_t(u32, hwi, 1);
   1191	iocg->hweight_gen = ioc_gen;
   1192out:
   1193	if (hw_activep)
   1194		*hw_activep = iocg->hweight_active;
   1195	if (hw_inusep)
   1196		*hw_inusep = iocg->hweight_inuse;
   1197}
   1198
   1199/*
   1200 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
   1201 * other weights stay unchanged.
   1202 */
   1203static u32 current_hweight_max(struct ioc_gq *iocg)
   1204{
   1205	u32 hwm = WEIGHT_ONE;
   1206	u32 inuse = iocg->active;
   1207	u64 child_inuse_sum;
   1208	int lvl;
   1209
   1210	lockdep_assert_held(&iocg->ioc->lock);
   1211
   1212	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
   1213		struct ioc_gq *parent = iocg->ancestors[lvl];
   1214		struct ioc_gq *child = iocg->ancestors[lvl + 1];
   1215
   1216		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
   1217		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
   1218		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
   1219					   parent->child_active_sum);
   1220	}
   1221
   1222	return max_t(u32, hwm, 1);
   1223}
   1224
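        /*
         * Re-evaluate @iocg's effective weight from the per-device
         * configuration and the cgroup default weight, and propagate the
         * change if the iocg is currently active.
         */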
   1225static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
   1226{
   1227	struct ioc *ioc = iocg->ioc;
   1228	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
   1229	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
   1230	u32 weight;
   1231
   1232	lockdep_assert_held(&ioc->lock);
   1233
   1234	weight = iocg->cfg_weight ?: iocc->dfl_weight;
   1235	if (weight != iocg->weight && iocg->active)
   1236		propagate_weights(iocg, weight, iocg->inuse, true, now);
   1237	iocg->weight = weight;
   1238}
   1239
   1240static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
   1241{
   1242	struct ioc *ioc = iocg->ioc;
   1243	u64 last_period, cur_period;
   1244	u64 vtime, vtarget;
   1245	int i;
   1246
   1247	/*
    1248	 * If we seem to be already active, just update the stamp to tell the
    1249	 * timer that we're still active.  We don't mind occasional races.
   1250	 */
   1251	if (!list_empty(&iocg->active_list)) {
   1252		ioc_now(ioc, now);
   1253		cur_period = atomic64_read(&ioc->cur_period);
   1254		if (atomic64_read(&iocg->active_period) != cur_period)
   1255			atomic64_set(&iocg->active_period, cur_period);
   1256		return true;
   1257	}
   1258
   1259	/* racy check on internal node IOs, treat as root level IOs */
   1260	if (iocg->child_active_sum)
   1261		return false;
   1262
   1263	spin_lock_irq(&ioc->lock);
   1264
   1265	ioc_now(ioc, now);
   1266
   1267	/* update period */
   1268	cur_period = atomic64_read(&ioc->cur_period);
   1269	last_period = atomic64_read(&iocg->active_period);
   1270	atomic64_set(&iocg->active_period, cur_period);
   1271
   1272	/* already activated or breaking leaf-only constraint? */
   1273	if (!list_empty(&iocg->active_list))
   1274		goto succeed_unlock;
   1275	for (i = iocg->level - 1; i > 0; i--)
   1276		if (!list_empty(&iocg->ancestors[i]->active_list))
   1277			goto fail_unlock;
   1278
   1279	if (iocg->child_active_sum)
   1280		goto fail_unlock;
   1281
   1282	/*
   1283	 * Always start with the target budget. On deactivation, we throw away
   1284	 * anything above it.
   1285	 */
   1286	vtarget = now->vnow - ioc->margins.target;
   1287	vtime = atomic64_read(&iocg->vtime);
   1288
   1289	atomic64_add(vtarget - vtime, &iocg->vtime);
   1290	atomic64_add(vtarget - vtime, &iocg->done_vtime);
   1291	vtime = vtarget;
   1292
   1293	/*
   1294	 * Activate, propagate weight and start period timer if not
   1295	 * running.  Reset hweight_gen to avoid accidental match from
   1296	 * wrapping.
   1297	 */
   1298	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
   1299	list_add(&iocg->active_list, &ioc->active_iocgs);
   1300
   1301	propagate_weights(iocg, iocg->weight,
   1302			  iocg->last_inuse ?: iocg->weight, true, now);
   1303
   1304	TRACE_IOCG_PATH(iocg_activate, iocg, now,
   1305			last_period, cur_period, vtime);
   1306
   1307	iocg->activated_at = now->now;
   1308
   1309	if (ioc->running == IOC_IDLE) {
   1310		ioc->running = IOC_RUNNING;
   1311		ioc->dfgv_period_at = now->now;
   1312		ioc->dfgv_period_rem = 0;
   1313		ioc_start_period(ioc, now);
   1314	}
   1315
   1316succeed_unlock:
   1317	spin_unlock_irq(&ioc->lock);
   1318	return true;
   1319
   1320fail_unlock:
   1321	spin_unlock_irq(&ioc->lock);
   1322	return false;
   1323}
   1324
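        /*
         * Re-evaluate the issuer delay for @iocg: decay the delay currently
         * in effect (it halves every second), derive a new delay from the
         * vtime overage caused by the outstanding absolute debt, and apply
         * whichever is higher through blk-cgroup.  Returns %true if a delay
         * remains in effect.
         */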
   1325static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
   1326{
   1327	struct ioc *ioc = iocg->ioc;
   1328	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
   1329	u64 tdelta, delay, new_delay;
   1330	s64 vover, vover_pct;
   1331	u32 hwa;
   1332
   1333	lockdep_assert_held(&iocg->waitq.lock);
   1334
   1335	/* calculate the current delay in effect - 1/2 every second */
   1336	tdelta = now->now - iocg->delay_at;
   1337	if (iocg->delay)
   1338		delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
   1339	else
   1340		delay = 0;
   1341
   1342	/* calculate the new delay from the debt amount */
   1343	current_hweight(iocg, &hwa, NULL);
   1344	vover = atomic64_read(&iocg->vtime) +
   1345		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
   1346	vover_pct = div64_s64(100 * vover,
   1347			      ioc->period_us * ioc->vtime_base_rate);
   1348
   1349	if (vover_pct <= MIN_DELAY_THR_PCT)
   1350		new_delay = 0;
   1351	else if (vover_pct >= MAX_DELAY_THR_PCT)
   1352		new_delay = MAX_DELAY;
   1353	else
   1354		new_delay = MIN_DELAY +
   1355			div_u64((MAX_DELAY - MIN_DELAY) *
   1356				(vover_pct - MIN_DELAY_THR_PCT),
   1357				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
   1358
   1359	/* pick the higher one and apply */
   1360	if (new_delay > delay) {
   1361		iocg->delay = new_delay;
   1362		iocg->delay_at = now->now;
   1363		delay = new_delay;
   1364	}
   1365
   1366	if (delay >= MIN_DELAY) {
   1367		if (!iocg->indelay_since)
   1368			iocg->indelay_since = now->now;
   1369		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
   1370		return true;
   1371	} else {
   1372		if (iocg->indelay_since) {
   1373			iocg->stat.indelay_us += now->now - iocg->indelay_since;
   1374			iocg->indelay_since = 0;
   1375		}
   1376		iocg->delay = 0;
   1377		blkcg_clear_delay(blkg);
   1378		return false;
   1379	}
   1380}
   1381
   1382static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
   1383			    struct ioc_now *now)
   1384{
   1385	struct iocg_pcpu_stat *gcs;
   1386
   1387	lockdep_assert_held(&iocg->ioc->lock);
   1388	lockdep_assert_held(&iocg->waitq.lock);
   1389	WARN_ON_ONCE(list_empty(&iocg->active_list));
   1390
   1391	/*
   1392	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
    1393	 * inuse, donating all of its share to others until its debt is paid off.
   1394	 */
   1395	if (!iocg->abs_vdebt && abs_cost) {
   1396		iocg->indebt_since = now->now;
   1397		propagate_weights(iocg, iocg->active, 0, false, now);
   1398	}
   1399
   1400	iocg->abs_vdebt += abs_cost;
   1401
   1402	gcs = get_cpu_ptr(iocg->pcpu_stat);
   1403	local64_add(abs_cost, &gcs->abs_vusage);
   1404	put_cpu_ptr(gcs);
   1405}
   1406
   1407static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
   1408			  struct ioc_now *now)
   1409{
   1410	lockdep_assert_held(&iocg->ioc->lock);
   1411	lockdep_assert_held(&iocg->waitq.lock);
   1412
   1413	/* make sure that nobody messed with @iocg */
   1414	WARN_ON_ONCE(list_empty(&iocg->active_list));
   1415	WARN_ON_ONCE(iocg->inuse > 1);
   1416
   1417	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
   1418
   1419	/* if debt is paid in full, restore inuse */
   1420	if (!iocg->abs_vdebt) {
   1421		iocg->stat.indebt_us += now->now - iocg->indebt_since;
   1422		iocg->indebt_since = 0;
   1423
   1424		propagate_weights(iocg, iocg->active, iocg->last_inuse,
   1425				  false, now);
   1426	}
   1427}
   1428
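        /*
         * Wake function for @iocg's waiters.  Charge the waiting bio against
         * ctx->vbudget and commit it if the budget doesn't go negative;
         * otherwise, return -1 to stop the wakeup walk.
         */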
   1429static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
   1430			int flags, void *key)
   1431{
   1432	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
   1433	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
   1434	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
   1435
   1436	ctx->vbudget -= cost;
   1437
   1438	if (ctx->vbudget < 0)
   1439		return -1;
   1440
   1441	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
   1442	wait->committed = true;
   1443
   1444	/*
   1445	 * autoremove_wake_function() removes the wait entry only when it
   1446	 * actually changed the task state. We want the wait always removed.
   1447	 * Remove explicitly and use default_wake_function(). Note that the
   1448	 * order of operations is important as finish_wait() tests whether
   1449	 * @wq_entry is removed without grabbing the lock.
   1450	 */
   1451	default_wake_function(wq_entry, mode, flags, key);
   1452	list_del_init_careful(&wq_entry->entry);
   1453	return 0;
   1454}
   1455
   1456/*
   1457 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
   1458 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
   1459 * addition to iocg->waitq.lock.
   1460 */
   1461static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
   1462			    struct ioc_now *now)
   1463{
   1464	struct ioc *ioc = iocg->ioc;
   1465	struct iocg_wake_ctx ctx = { .iocg = iocg };
   1466	u64 vshortage, expires, oexpires;
   1467	s64 vbudget;
   1468	u32 hwa;
   1469
   1470	lockdep_assert_held(&iocg->waitq.lock);
   1471
   1472	current_hweight(iocg, &hwa, NULL);
   1473	vbudget = now->vnow - atomic64_read(&iocg->vtime);
   1474
   1475	/* pay off debt */
   1476	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
   1477		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
   1478		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
   1479		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
   1480
   1481		lockdep_assert_held(&ioc->lock);
   1482
   1483		atomic64_add(vpay, &iocg->vtime);
   1484		atomic64_add(vpay, &iocg->done_vtime);
   1485		iocg_pay_debt(iocg, abs_vpay, now);
   1486		vbudget -= vpay;
   1487	}
   1488
   1489	if (iocg->abs_vdebt || iocg->delay)
   1490		iocg_kick_delay(iocg, now);
   1491
   1492	/*
   1493	 * Debt can still be outstanding if we haven't paid all yet or the
   1494	 * caller raced and called without @pay_debt. Shouldn't wake up waiters
   1495	 * under debt. Make sure @vbudget reflects the outstanding amount and is
   1496	 * not positive.
   1497	 */
   1498	if (iocg->abs_vdebt) {
   1499		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
   1500		vbudget = min_t(s64, 0, vbudget - vdebt);
   1501	}
   1502
   1503	/*
   1504	 * Wake up the ones which are due and see how much vtime we'll need for
   1505	 * the next one. As paying off debt restores hw_inuse, it must be read
   1506	 * after the above debt payment.
   1507	 */
   1508	ctx.vbudget = vbudget;
   1509	current_hweight(iocg, NULL, &ctx.hw_inuse);
   1510
   1511	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
   1512
   1513	if (!waitqueue_active(&iocg->waitq)) {
   1514		if (iocg->wait_since) {
   1515			iocg->stat.wait_us += now->now - iocg->wait_since;
   1516			iocg->wait_since = 0;
   1517		}
   1518		return;
   1519	}
   1520
   1521	if (!iocg->wait_since)
   1522		iocg->wait_since = now->now;
   1523
   1524	if (WARN_ON_ONCE(ctx.vbudget >= 0))
   1525		return;
   1526
   1527	/* determine next wakeup, add a timer margin to guarantee chunking */
   1528	vshortage = -ctx.vbudget;
   1529	expires = now->now_ns +
   1530		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
   1531		NSEC_PER_USEC;
   1532	expires += ioc->timer_slack_ns;
   1533
   1534	/* if already active and close enough, don't bother */
   1535	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
   1536	if (hrtimer_is_queued(&iocg->waitq_timer) &&
   1537	    abs(oexpires - expires) <= ioc->timer_slack_ns)
   1538		return;
   1539
   1540	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
   1541			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
   1542}
   1543
   1544static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
   1545{
   1546	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
   1547	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
   1548	struct ioc_now now;
   1549	unsigned long flags;
   1550
   1551	ioc_now(iocg->ioc, &now);
   1552
   1553	iocg_lock(iocg, pay_debt, &flags);
   1554	iocg_kick_waitq(iocg, pay_debt, &now);
   1555	iocg_unlock(iocg, pay_debt, &flags);
   1556
   1557	return HRTIMER_NORESTART;
   1558}
   1559
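        /*
         * Sum up the per-cpu latency and rq-wait statistics accumulated since
         * the last call.  @missed_ppm_ar receives the parts-per-million of
         * READ/WRITE IOs which missed their latency targets and
         * *@rq_wait_pct_p the total rq wait time as a percentage of the
         * period.
         */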
   1560static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
   1561{
   1562	u32 nr_met[2] = { };
   1563	u32 nr_missed[2] = { };
   1564	u64 rq_wait_ns = 0;
   1565	int cpu, rw;
   1566
   1567	for_each_online_cpu(cpu) {
   1568		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
   1569		u64 this_rq_wait_ns;
   1570
   1571		for (rw = READ; rw <= WRITE; rw++) {
   1572			u32 this_met = local_read(&stat->missed[rw].nr_met);
   1573			u32 this_missed = local_read(&stat->missed[rw].nr_missed);
   1574
   1575			nr_met[rw] += this_met - stat->missed[rw].last_met;
   1576			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
   1577			stat->missed[rw].last_met = this_met;
   1578			stat->missed[rw].last_missed = this_missed;
   1579		}
   1580
   1581		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
   1582		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
   1583		stat->last_rq_wait_ns = this_rq_wait_ns;
   1584	}
   1585
   1586	for (rw = READ; rw <= WRITE; rw++) {
   1587		if (nr_met[rw] + nr_missed[rw])
   1588			missed_ppm_ar[rw] =
   1589				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
   1590						   nr_met[rw] + nr_missed[rw]);
   1591		else
   1592			missed_ppm_ar[rw] = 0;
   1593	}
   1594
   1595	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
   1596				   ioc->period_us * NSEC_PER_USEC);
   1597}
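
/*
 * For example (illustrative numbers): if 95 reads met and 5 reads missed
 * their latency target during the period, missed_ppm_ar[READ] =
 * DIV64_U64_ROUND_UP(5 * MILLION, 100) = 50000, i.e. 5%. If the summed
 * rq_wait_ns deltas across CPUs add up to half of period_us worth of
 * nanoseconds, *rq_wait_pct_p comes out as 50.
 */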
   1598
   1599/* was iocg idle this period? */
   1600static bool iocg_is_idle(struct ioc_gq *iocg)
   1601{
   1602	struct ioc *ioc = iocg->ioc;
   1603
   1604	/* did something get issued this period? */
   1605	if (atomic64_read(&iocg->active_period) ==
   1606	    atomic64_read(&ioc->cur_period))
   1607		return false;
   1608
   1609	/* is something in flight? */
   1610	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
   1611		return false;
   1612
   1613	return true;
   1614}
   1615
   1616/*
    1617 * Call this function on each target leaf @iocg to build a pre-order traversal
    1618 * list of all of its ancestors in @inner_walk. The inner nodes are linked through
   1619 * ->walk_list and the caller is responsible for dissolving the list after use.
   1620 */
   1621static void iocg_build_inner_walk(struct ioc_gq *iocg,
   1622				  struct list_head *inner_walk)
   1623{
   1624	int lvl;
   1625
   1626	WARN_ON_ONCE(!list_empty(&iocg->walk_list));
   1627
   1628	/* find the first ancestor which hasn't been visited yet */
   1629	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
   1630		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
   1631			break;
   1632	}
   1633
   1634	/* walk down and visit the inner nodes to get pre-order traversal */
   1635	while (++lvl <= iocg->level - 1) {
   1636		struct ioc_gq *inner = iocg->ancestors[lvl];
   1637
   1638		/* record traversal order */
   1639		list_add_tail(&inner->walk_list, inner_walk);
   1640	}
   1641}
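
/*
 * As an illustration, take a tree root(level 0) -> A(1) -> {A0, A1}(2)
 * with A0 and A1 as the target leaves (the names are made up). Calling
 * this on A0 finds no visited ancestor, so root and then A are appended
 * to @inner_walk. Calling it next on A1 finds A already on the list and
 * appends nothing, so each inner node shows up exactly once in pre-order.
 */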
   1642
   1643/* propagate the deltas to the parent */
   1644static void iocg_flush_stat_upward(struct ioc_gq *iocg)
   1645{
   1646	if (iocg->level > 0) {
   1647		struct iocg_stat *parent_stat =
   1648			&iocg->ancestors[iocg->level - 1]->stat;
   1649
   1650		parent_stat->usage_us +=
   1651			iocg->stat.usage_us - iocg->last_stat.usage_us;
   1652		parent_stat->wait_us +=
   1653			iocg->stat.wait_us - iocg->last_stat.wait_us;
   1654		parent_stat->indebt_us +=
   1655			iocg->stat.indebt_us - iocg->last_stat.indebt_us;
   1656		parent_stat->indelay_us +=
   1657			iocg->stat.indelay_us - iocg->last_stat.indelay_us;
   1658	}
   1659
   1660	iocg->last_stat = iocg->stat;
   1661}
   1662
   1663/* collect per-cpu counters and propagate the deltas to the parent */
   1664static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
   1665{
   1666	struct ioc *ioc = iocg->ioc;
   1667	u64 abs_vusage = 0;
   1668	u64 vusage_delta;
   1669	int cpu;
   1670
   1671	lockdep_assert_held(&iocg->ioc->lock);
   1672
   1673	/* collect per-cpu counters */
   1674	for_each_possible_cpu(cpu) {
   1675		abs_vusage += local64_read(
   1676				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
   1677	}
   1678	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
   1679	iocg->last_stat_abs_vusage = abs_vusage;
   1680
   1681	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
   1682	iocg->stat.usage_us += iocg->usage_delta_us;
   1683
   1684	iocg_flush_stat_upward(iocg);
   1685}
   1686
   1687/* get stat counters ready for reading on all active iocgs */
   1688static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
   1689{
   1690	LIST_HEAD(inner_walk);
   1691	struct ioc_gq *iocg, *tiocg;
   1692
   1693	/* flush leaves and build inner node walk list */
   1694	list_for_each_entry(iocg, target_iocgs, active_list) {
   1695		iocg_flush_stat_leaf(iocg, now);
   1696		iocg_build_inner_walk(iocg, &inner_walk);
   1697	}
   1698
   1699	/* keep flushing upwards by walking the inner list backwards */
   1700	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
   1701		iocg_flush_stat_upward(iocg);
   1702		list_del_init(&iocg->walk_list);
   1703	}
   1704}
   1705
   1706/*
   1707 * Determine what @iocg's hweight_inuse should be after donating unused
    1708 * capacity. @hwm is the upper bound and is used to signal no donation. This
   1709 * function also throws away @iocg's excess budget.
   1710 */
   1711static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
   1712				  u32 usage, struct ioc_now *now)
   1713{
   1714	struct ioc *ioc = iocg->ioc;
   1715	u64 vtime = atomic64_read(&iocg->vtime);
   1716	s64 excess, delta, target, new_hwi;
   1717
   1718	/* debt handling owns inuse for debtors */
   1719	if (iocg->abs_vdebt)
   1720		return 1;
   1721
   1722	/* see whether minimum margin requirement is met */
   1723	if (waitqueue_active(&iocg->waitq) ||
   1724	    time_after64(vtime, now->vnow - ioc->margins.min))
   1725		return hwm;
   1726
   1727	/* throw away excess above target */
   1728	excess = now->vnow - vtime - ioc->margins.target;
   1729	if (excess > 0) {
   1730		atomic64_add(excess, &iocg->vtime);
   1731		atomic64_add(excess, &iocg->done_vtime);
   1732		vtime += excess;
   1733		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
   1734	}
   1735
   1736	/*
    1737	 * Let delta be the distance between the iocg's and the device's vtimes
    1738	 * as a fraction of the period duration. Assuming that the iocg will
   1739	 * consume the usage determined above, we want to determine new_hwi so
   1740	 * that delta equals MARGIN_TARGET at the end of the next period.
   1741	 *
   1742	 * We need to execute usage worth of IOs while spending the sum of the
   1743	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
   1744	 * (delta):
   1745	 *
   1746	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
   1747	 *
   1748	 * Therefore, the new_hwi is:
   1749	 *
   1750	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
   1751	 */
   1752	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
   1753			  now->vnow - ioc->period_at_vtime);
   1754	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
   1755	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
   1756
   1757	return clamp_t(s64, new_hwi, 1, hwm);
   1758}
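
/*
 * Numeric sketch of the formula above, with made-up values and assuming a
 * MARGIN_TARGET of 50% purely for illustration: if the iocg used 25% of
 * the device (usage = WEIGHT_ONE / 4) and carries a leftover of delta =
 * 10%, then new_hwi = 0.25 / (1 - 0.5 + 0.1) ~= 0.42, i.e. roughly 42% of
 * WEIGHT_ONE, which is then clamped to [1, hwm].
 */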
   1759
   1760/*
   1761 * For work-conservation, an iocg which isn't using all of its share should
   1762 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
   1763 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
   1764 *
   1765 * #1 is mathematically simpler but has the drawback of requiring synchronous
   1766 * global hweight_inuse updates when idle iocg's get activated or inuse weights
   1767 * change due to donation snapbacks as it has the possibility of grossly
   1768 * overshooting what's allowed by the model and vrate.
   1769 *
   1770 * #2 is inherently safe with local operations. The donating iocg can easily
   1771 * snap back to higher weights when needed without worrying about impacts on
   1772 * other nodes as the impacts will be inherently correct. This also makes idle
   1773 * iocg activations safe. The only effect activations have is decreasing
   1774 * hweight_inuse of others, the right solution to which is for those iocgs to
   1775 * snap back to higher weights.
   1776 *
   1777 * So, we go with #2. The challenge is calculating how each donating iocg's
   1778 * inuse should be adjusted to achieve the target donation amounts. This is done
   1779 * using Andy's method described in the following pdf.
   1780 *
   1781 *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
   1782 *
   1783 * Given the weights and target after-donation hweight_inuse values, Andy's
    1784 * method determines what the proportional distribution should look like at each
   1785 * sibling level to maintain the relative relationship between all non-donating
   1786 * pairs. To roughly summarize, it divides the tree into donating and
   1787 * non-donating parts, calculates global donation rate which is used to
   1788 * determine the target hweight_inuse for each node, and then derives per-level
   1789 * proportions.
   1790 *
   1791 * The following pdf shows that global distribution calculated this way can be
   1792 * achieved by scaling inuse weights of donating leaves and propagating the
   1793 * adjustments upwards proportionally.
   1794 *
   1795 *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
   1796 *
   1797 * Combining the above two, we can determine how each leaf iocg's inuse should
   1798 * be adjusted to achieve the target donation.
   1799 *
   1800 *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
   1801 *
   1802 * The inline comments use symbols from the last pdf.
   1803 *
   1804 *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
   1805 *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
   1806 *   t is the sum of the absolute budgets of donating nodes in the subtree.
   1807 *   w is the weight of the node. w = w_f + w_t
   1808 *   w_f is the non-donating portion of w. w_f = w * f / b
    1809 *   w_t is the donating portion of w. w_t = w * t / b
   1810 *   s is the sum of all sibling weights. s = Sum(w) for siblings
   1811 *   s_f and s_t are the non-donating and donating portions of s.
   1812 *
   1813 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
   1814 * w_pt is the donating portion of the parent's weight and w'_pt the same value
   1815 * after adjustments. Subscript r denotes the root node's values.
   1816 */
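/*
 * Worked example of gamma (illustrative numbers): if the donating leaves
 * currently hold t_r = 40% of the device and will retain t_r' = 10% after
 * donating, gamma = (1 - 0.1) / (1 - 0.4) = 1.5, i.e. every non-donating
 * budget b_f is scaled up by 1.5x. In the code below gamma is kept in
 * WEIGHT_ONE units, hence the extra multiplication by WEIGHT_ONE.
 */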
   1817static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
   1818{
   1819	LIST_HEAD(over_hwa);
   1820	LIST_HEAD(inner_walk);
   1821	struct ioc_gq *iocg, *tiocg, *root_iocg;
   1822	u32 after_sum, over_sum, over_target, gamma;
   1823
   1824	/*
   1825	 * It's pretty unlikely but possible for the total sum of
    1826	 * hweight_after_donation values to be higher than WEIGHT_ONE, which will
    1827	 * confuse the following calculations. If such a condition is detected,
    1828	 * scale down everyone over their full share equally to keep the sum below
   1829	 * WEIGHT_ONE.
   1830	 */
   1831	after_sum = 0;
   1832	over_sum = 0;
   1833	list_for_each_entry(iocg, surpluses, surplus_list) {
   1834		u32 hwa;
   1835
   1836		current_hweight(iocg, &hwa, NULL);
   1837		after_sum += iocg->hweight_after_donation;
   1838
   1839		if (iocg->hweight_after_donation > hwa) {
   1840			over_sum += iocg->hweight_after_donation;
   1841			list_add(&iocg->walk_list, &over_hwa);
   1842		}
   1843	}
   1844
   1845	if (after_sum >= WEIGHT_ONE) {
   1846		/*
   1847		 * The delta should be deducted from the over_sum, calculate
   1848		 * target over_sum value.
   1849		 */
   1850		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
   1851		WARN_ON_ONCE(over_sum <= over_delta);
   1852		over_target = over_sum - over_delta;
   1853	} else {
   1854		over_target = 0;
   1855	}
   1856
   1857	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
   1858		if (over_target)
   1859			iocg->hweight_after_donation =
   1860				div_u64((u64)iocg->hweight_after_donation *
   1861					over_target, over_sum);
   1862		list_del_init(&iocg->walk_list);
   1863	}
   1864
   1865	/*
   1866	 * Build pre-order inner node walk list and prepare for donation
   1867	 * adjustment calculations.
   1868	 */
   1869	list_for_each_entry(iocg, surpluses, surplus_list) {
   1870		iocg_build_inner_walk(iocg, &inner_walk);
   1871	}
   1872
   1873	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
   1874	WARN_ON_ONCE(root_iocg->level > 0);
   1875
   1876	list_for_each_entry(iocg, &inner_walk, walk_list) {
   1877		iocg->child_adjusted_sum = 0;
   1878		iocg->hweight_donating = 0;
   1879		iocg->hweight_after_donation = 0;
   1880	}
   1881
   1882	/*
   1883	 * Propagate the donating budget (b_t) and after donation budget (b'_t)
   1884	 * up the hierarchy.
   1885	 */
   1886	list_for_each_entry(iocg, surpluses, surplus_list) {
   1887		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
   1888
   1889		parent->hweight_donating += iocg->hweight_donating;
   1890		parent->hweight_after_donation += iocg->hweight_after_donation;
   1891	}
   1892
   1893	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
   1894		if (iocg->level > 0) {
   1895			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
   1896
   1897			parent->hweight_donating += iocg->hweight_donating;
   1898			parent->hweight_after_donation += iocg->hweight_after_donation;
   1899		}
   1900	}
   1901
   1902	/*
   1903	 * Calculate inner hwa's (b) and make sure the donation values are
   1904	 * within the accepted ranges as we're doing low res calculations with
   1905	 * roundups.
   1906	 */
   1907	list_for_each_entry(iocg, &inner_walk, walk_list) {
   1908		if (iocg->level) {
   1909			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
   1910
   1911			iocg->hweight_active = DIV64_U64_ROUND_UP(
   1912				(u64)parent->hweight_active * iocg->active,
   1913				parent->child_active_sum);
   1914
   1915		}
   1916
   1917		iocg->hweight_donating = min(iocg->hweight_donating,
   1918					     iocg->hweight_active);
   1919		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
   1920						   iocg->hweight_donating - 1);
   1921		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
   1922				 iocg->hweight_donating <= 1 ||
   1923				 iocg->hweight_after_donation == 0)) {
   1924			pr_warn("iocg: invalid donation weights in ");
   1925			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
   1926			pr_cont(": active=%u donating=%u after=%u\n",
   1927				iocg->hweight_active, iocg->hweight_donating,
   1928				iocg->hweight_after_donation);
   1929		}
   1930	}
   1931
   1932	/*
   1933	 * Calculate the global donation rate (gamma) - the rate to adjust
   1934	 * non-donating budgets by.
   1935	 *
   1936	 * No need to use 64bit multiplication here as the first operand is
   1937	 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
   1938	 *
   1939	 * We know that there are beneficiary nodes and the sum of the donating
   1940	 * hweights can't be whole; however, due to the round-ups during hweight
   1941	 * calculations, root_iocg->hweight_donating might still end up equal to
   1942	 * or greater than whole. Limit the range when calculating the divider.
   1943	 *
   1944	 * gamma = (1 - t_r') / (1 - t_r)
   1945	 */
   1946	gamma = DIV_ROUND_UP(
   1947		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
   1948		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
   1949
   1950	/*
   1951	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
   1952	 * nodes.
   1953	 */
   1954	list_for_each_entry(iocg, &inner_walk, walk_list) {
   1955		struct ioc_gq *parent;
   1956		u32 inuse, wpt, wptp;
   1957		u64 st, sf;
   1958
   1959		if (iocg->level == 0) {
   1960			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
   1961			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
   1962				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
   1963				WEIGHT_ONE - iocg->hweight_after_donation);
   1964			continue;
   1965		}
   1966
   1967		parent = iocg->ancestors[iocg->level - 1];
   1968
   1969		/* b' = gamma * b_f + b_t' */
   1970		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
   1971			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
   1972			WEIGHT_ONE) + iocg->hweight_after_donation;
   1973
   1974		/* w' = s' * b' / b'_p */
   1975		inuse = DIV64_U64_ROUND_UP(
   1976			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
   1977			parent->hweight_inuse);
   1978
   1979		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
   1980		st = DIV64_U64_ROUND_UP(
   1981			iocg->child_active_sum * iocg->hweight_donating,
   1982			iocg->hweight_active);
   1983		sf = iocg->child_active_sum - st;
   1984		wpt = DIV64_U64_ROUND_UP(
   1985			(u64)iocg->active * iocg->hweight_donating,
   1986			iocg->hweight_active);
   1987		wptp = DIV64_U64_ROUND_UP(
   1988			(u64)inuse * iocg->hweight_after_donation,
   1989			iocg->hweight_inuse);
   1990
   1991		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
   1992	}
   1993
   1994	/*
   1995	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
   1996	 * we can finally determine leaf adjustments.
   1997	 */
   1998	list_for_each_entry(iocg, surpluses, surplus_list) {
   1999		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
   2000		u32 inuse;
   2001
   2002		/*
   2003		 * In-debt iocgs participated in the donation calculation with
   2004		 * the minimum target hweight_inuse. Configuring inuse
   2005		 * accordingly would work fine but debt handling expects
    2006		 * @iocg->inuse to stay at the minimum and we don't want to
    2007		 * interfere.
   2008		 */
   2009		if (iocg->abs_vdebt) {
   2010			WARN_ON_ONCE(iocg->inuse > 1);
   2011			continue;
   2012		}
   2013
   2014		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
   2015		inuse = DIV64_U64_ROUND_UP(
   2016			parent->child_adjusted_sum * iocg->hweight_after_donation,
   2017			parent->hweight_inuse);
   2018
   2019		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
   2020				iocg->inuse, inuse,
   2021				iocg->hweight_inuse,
   2022				iocg->hweight_after_donation);
   2023
   2024		__propagate_weights(iocg, iocg->active, inuse, true, now);
   2025	}
   2026
   2027	/* walk list should be dissolved after use */
   2028	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
   2029		list_del_init(&iocg->walk_list);
   2030}
   2031
   2032/*
   2033 * A low weight iocg can amass a large amount of debt, for example, when
   2034 * anonymous memory gets reclaimed aggressively. If the system has a lot of
   2035 * memory paired with a slow IO device, the debt can span multiple seconds or
   2036 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
   2037 * up blocked paying its debt while the IO device is idle.
   2038 *
   2039 * The following protects against such cases. If the device has been
   2040 * sufficiently idle for a while, the debts are halved and delays are
   2041 * recalculated.
   2042 */
   2043static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
   2044			      struct ioc_now *now)
   2045{
   2046	struct ioc_gq *iocg;
   2047	u64 dur, usage_pct, nr_cycles;
   2048
   2049	/* if no debtor, reset the cycle */
   2050	if (!nr_debtors) {
   2051		ioc->dfgv_period_at = now->now;
   2052		ioc->dfgv_period_rem = 0;
   2053		ioc->dfgv_usage_us_sum = 0;
   2054		return;
   2055	}
   2056
   2057	/*
   2058	 * Debtors can pass through a lot of writes choking the device and we
   2059	 * don't want to be forgiving debts while the device is struggling from
   2060	 * write bursts. If we're missing latency targets, consider the device
   2061	 * fully utilized.
   2062	 */
   2063	if (ioc->busy_level > 0)
   2064		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
   2065
   2066	ioc->dfgv_usage_us_sum += usage_us_sum;
   2067	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
   2068		return;
   2069
   2070	/*
   2071	 * At least DFGV_PERIOD has passed since the last period. Calculate the
   2072	 * average usage and reset the period counters.
   2073	 */
   2074	dur = now->now - ioc->dfgv_period_at;
   2075	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
   2076
   2077	ioc->dfgv_period_at = now->now;
   2078	ioc->dfgv_usage_us_sum = 0;
   2079
   2080	/* if was too busy, reset everything */
   2081	if (usage_pct > DFGV_USAGE_PCT) {
   2082		ioc->dfgv_period_rem = 0;
   2083		return;
   2084	}
   2085
   2086	/*
   2087	 * Usage is lower than threshold. Let's forgive some debts. Debt
   2088	 * forgiveness runs off of the usual ioc timer but its period usually
   2089	 * doesn't match ioc's. Compensate the difference by performing the
   2090	 * reduction as many times as would fit in the duration since the last
   2091	 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
   2092	 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
   2093	 * reductions is doubled.
   2094	 */
   2095	nr_cycles = dur + ioc->dfgv_period_rem;
   2096	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
   2097
   2098	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
   2099		u64 __maybe_unused old_debt, __maybe_unused old_delay;
   2100
   2101		if (!iocg->abs_vdebt && !iocg->delay)
   2102			continue;
   2103
   2104		spin_lock(&iocg->waitq.lock);
   2105
   2106		old_debt = iocg->abs_vdebt;
   2107		old_delay = iocg->delay;
   2108
   2109		if (iocg->abs_vdebt)
   2110			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
   2111		if (iocg->delay)
   2112			iocg->delay = iocg->delay >> nr_cycles ?: 1;
   2113
   2114		iocg_kick_waitq(iocg, true, now);
   2115
   2116		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
   2117				old_debt, iocg->abs_vdebt,
   2118				old_delay, iocg->delay);
   2119
   2120		spin_unlock(&iocg->waitq.lock);
   2121	}
   2122}
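
/*
 * Forgiveness arithmetic sketch (illustrative): if the device stayed below
 * the DFGV_USAGE_PCT threshold and dur + dfgv_period_rem amounts to two
 * DFGV_PERIODs plus a remainder r, then nr_cycles is 2, r is carried over
 * in dfgv_period_rem, and every debtor's abs_vdebt and delay are shifted
 * right by 2 (quartered), bottoming out at 1 rather than 0.
 */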
   2123
   2124/*
    2125 * Check the active iocgs' state to avoid oversleeping and deactivate
   2126 * idle iocgs.
   2127 *
   2128 * Since waiters determine the sleep durations based on the vrate
   2129 * they saw at the time of sleep, if vrate has increased, some
   2130 * waiters could be sleeping for too long. Wake up tardy waiters
   2131 * which should have woken up in the last period and expire idle
   2132 * iocgs.
   2133 */
   2134static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
   2135{
   2136	int nr_debtors = 0;
   2137	struct ioc_gq *iocg, *tiocg;
   2138
   2139	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
   2140		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
   2141		    !iocg->delay && !iocg_is_idle(iocg))
   2142			continue;
   2143
   2144		spin_lock(&iocg->waitq.lock);
   2145
   2146		/* flush wait and indebt stat deltas */
   2147		if (iocg->wait_since) {
   2148			iocg->stat.wait_us += now->now - iocg->wait_since;
   2149			iocg->wait_since = now->now;
   2150		}
   2151		if (iocg->indebt_since) {
   2152			iocg->stat.indebt_us +=
   2153				now->now - iocg->indebt_since;
   2154			iocg->indebt_since = now->now;
   2155		}
   2156		if (iocg->indelay_since) {
   2157			iocg->stat.indelay_us +=
   2158				now->now - iocg->indelay_since;
   2159			iocg->indelay_since = now->now;
   2160		}
   2161
   2162		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
   2163		    iocg->delay) {
   2164			/* might be oversleeping vtime / hweight changes, kick */
   2165			iocg_kick_waitq(iocg, true, now);
   2166			if (iocg->abs_vdebt || iocg->delay)
   2167				nr_debtors++;
   2168		} else if (iocg_is_idle(iocg)) {
   2169			/* no waiter and idle, deactivate */
   2170			u64 vtime = atomic64_read(&iocg->vtime);
   2171			s64 excess;
   2172
   2173			/*
   2174			 * @iocg has been inactive for a full duration and will
   2175			 * have a high budget. Account anything above target as
   2176			 * error and throw away. On reactivation, it'll start
   2177			 * with the target budget.
   2178			 */
   2179			excess = now->vnow - vtime - ioc->margins.target;
   2180			if (excess > 0) {
   2181				u32 old_hwi;
   2182
   2183				current_hweight(iocg, NULL, &old_hwi);
   2184				ioc->vtime_err -= div64_u64(excess * old_hwi,
   2185							    WEIGHT_ONE);
   2186			}
   2187
   2188			TRACE_IOCG_PATH(iocg_idle, iocg, now,
   2189					atomic64_read(&iocg->active_period),
   2190					atomic64_read(&ioc->cur_period), vtime);
   2191			__propagate_weights(iocg, 0, 0, false, now);
   2192			list_del_init(&iocg->active_list);
   2193		}
   2194
   2195		spin_unlock(&iocg->waitq.lock);
   2196	}
   2197
   2198	commit_weights(ioc);
   2199	return nr_debtors;
   2200}
   2201
   2202static void ioc_timer_fn(struct timer_list *timer)
   2203{
   2204	struct ioc *ioc = container_of(timer, struct ioc, timer);
   2205	struct ioc_gq *iocg, *tiocg;
   2206	struct ioc_now now;
   2207	LIST_HEAD(surpluses);
   2208	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
   2209	u64 usage_us_sum = 0;
   2210	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
   2211	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
   2212	u32 missed_ppm[2], rq_wait_pct;
   2213	u64 period_vtime;
   2214	int prev_busy_level;
   2215
   2216	/* how were the latencies during the period? */
   2217	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
   2218
   2219	/* take care of active iocgs */
   2220	spin_lock_irq(&ioc->lock);
   2221
   2222	ioc_now(ioc, &now);
   2223
   2224	period_vtime = now.vnow - ioc->period_at_vtime;
   2225	if (WARN_ON_ONCE(!period_vtime)) {
   2226		spin_unlock_irq(&ioc->lock);
   2227		return;
   2228	}
   2229
   2230	nr_debtors = ioc_check_iocgs(ioc, &now);
   2231
   2232	/*
   2233	 * Wait and indebt stat are flushed above and the donation calculation
   2234	 * below needs updated usage stat. Let's bring stat up-to-date.
   2235	 */
   2236	iocg_flush_stat(&ioc->active_iocgs, &now);
   2237
   2238	/* calc usage and see whether some weights need to be moved around */
   2239	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
   2240		u64 vdone, vtime, usage_us;
   2241		u32 hw_active, hw_inuse;
   2242
   2243		/*
   2244		 * Collect unused and wind vtime closer to vnow to prevent
   2245		 * iocgs from accumulating a large amount of budget.
   2246		 */
   2247		vdone = atomic64_read(&iocg->done_vtime);
   2248		vtime = atomic64_read(&iocg->vtime);
   2249		current_hweight(iocg, &hw_active, &hw_inuse);
   2250
   2251		/*
   2252		 * Latency QoS detection doesn't account for IOs which are
   2253		 * in-flight for longer than a period.  Detect them by
   2254		 * comparing vdone against period start.  If lagging behind
   2255		 * IOs from past periods, don't increase vrate.
   2256		 */
   2257		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
   2258		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
   2259		    time_after64(vtime, vdone) &&
   2260		    time_after64(vtime, now.vnow -
   2261				 MAX_LAGGING_PERIODS * period_vtime) &&
   2262		    time_before64(vdone, now.vnow - period_vtime))
   2263			nr_lagging++;
   2264
   2265		/*
   2266		 * Determine absolute usage factoring in in-flight IOs to avoid
   2267		 * high-latency completions appearing as idle.
   2268		 */
   2269		usage_us = iocg->usage_delta_us;
   2270		usage_us_sum += usage_us;
   2271
   2272		/* see whether there's surplus vtime */
   2273		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
   2274		if (hw_inuse < hw_active ||
   2275		    (!waitqueue_active(&iocg->waitq) &&
   2276		     time_before64(vtime, now.vnow - ioc->margins.low))) {
   2277			u32 hwa, old_hwi, hwm, new_hwi, usage;
   2278			u64 usage_dur;
   2279
   2280			if (vdone != vtime) {
   2281				u64 inflight_us = DIV64_U64_ROUND_UP(
   2282					cost_to_abs_cost(vtime - vdone, hw_inuse),
   2283					ioc->vtime_base_rate);
   2284
   2285				usage_us = max(usage_us, inflight_us);
   2286			}
   2287
   2288			/* convert to hweight based usage ratio */
   2289			if (time_after64(iocg->activated_at, ioc->period_at))
   2290				usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
   2291			else
   2292				usage_dur = max_t(u64, now.now - ioc->period_at, 1);
   2293
   2294			usage = clamp_t(u32,
   2295				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
   2296						   usage_dur),
   2297				1, WEIGHT_ONE);
   2298
   2299			/*
   2300			 * Already donating or accumulated enough to start.
   2301			 * Determine the donation amount.
   2302			 */
   2303			current_hweight(iocg, &hwa, &old_hwi);
   2304			hwm = current_hweight_max(iocg);
   2305			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
   2306							 usage, &now);
   2307			/*
   2308			 * Donation calculation assumes hweight_after_donation
   2309			 * to be positive, a condition that a donor w/ hwa < 2
   2310			 * can't meet. Don't bother with donation if hwa is
    2311			 * below 2. It's not going to make a meaningful difference
   2312			 * anyway.
   2313			 */
   2314			if (new_hwi < hwm && hwa >= 2) {
   2315				iocg->hweight_donating = hwa;
   2316				iocg->hweight_after_donation = new_hwi;
   2317				list_add(&iocg->surplus_list, &surpluses);
   2318			} else if (!iocg->abs_vdebt) {
   2319				/*
   2320				 * @iocg doesn't have enough to donate. Reset
   2321				 * its inuse to active.
   2322				 *
    2323				 * Don't reset debtors as their inuses are
    2324				 * owned by debt handling. This shouldn't affect
    2325				 * donation calculation in any meaningful way
   2326				 * as @iocg doesn't have a meaningful amount of
   2327				 * share anyway.
   2328				 */
   2329				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
   2330						iocg->inuse, iocg->active,
   2331						iocg->hweight_inuse, new_hwi);
   2332
   2333				__propagate_weights(iocg, iocg->active,
   2334						    iocg->active, true, &now);
   2335				nr_shortages++;
   2336			}
   2337		} else {
   2338			/* genuinely short on vtime */
   2339			nr_shortages++;
   2340		}
   2341	}
   2342
   2343	if (!list_empty(&surpluses) && nr_shortages)
   2344		transfer_surpluses(&surpluses, &now);
   2345
   2346	commit_weights(ioc);
   2347
   2348	/* surplus list should be dissolved after use */
   2349	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
   2350		list_del_init(&iocg->surplus_list);
   2351
   2352	/*
   2353	 * If q is getting clogged or we're missing too much, we're issuing
   2354	 * too much IO and should lower vtime rate.  If we're not missing
   2355	 * and experiencing shortages but not surpluses, we're too stingy
   2356	 * and should increase vtime rate.
   2357	 */
   2358	prev_busy_level = ioc->busy_level;
   2359	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
   2360	    missed_ppm[READ] > ppm_rthr ||
   2361	    missed_ppm[WRITE] > ppm_wthr) {
   2362		/* clearly missing QoS targets, slow down vrate */
   2363		ioc->busy_level = max(ioc->busy_level, 0);
   2364		ioc->busy_level++;
   2365	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
   2366		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
   2367		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
   2368		/* QoS targets are being met with >25% margin */
   2369		if (nr_shortages) {
   2370			/*
   2371			 * We're throttling while the device has spare
   2372			 * capacity.  If vrate was being slowed down, stop.
   2373			 */
   2374			ioc->busy_level = min(ioc->busy_level, 0);
   2375
   2376			/*
   2377			 * If there are IOs spanning multiple periods, wait
   2378			 * them out before pushing the device harder.
   2379			 */
   2380			if (!nr_lagging)
   2381				ioc->busy_level--;
   2382		} else {
   2383			/*
   2384			 * Nobody is being throttled and the users aren't
   2385			 * issuing enough IOs to saturate the device.  We
   2386			 * simply don't know how close the device is to
   2387			 * saturation.  Coast.
   2388			 */
   2389			ioc->busy_level = 0;
   2390		}
   2391	} else {
    2392		/* inside the hysteresis margin, we're good */
   2393		ioc->busy_level = 0;
   2394	}
   2395
   2396	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
   2397
   2398	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
   2399			      prev_busy_level, missed_ppm);
   2400
   2401	ioc_refresh_params(ioc, false);
   2402
   2403	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
   2404
   2405	/*
   2406	 * This period is done.  Move onto the next one.  If nothing's
   2407	 * going on with the device, stop the timer.
   2408	 */
   2409	atomic64_inc(&ioc->cur_period);
   2410
   2411	if (ioc->running != IOC_STOP) {
   2412		if (!list_empty(&ioc->active_iocgs)) {
   2413			ioc_start_period(ioc, &now);
   2414		} else {
   2415			ioc->busy_level = 0;
   2416			ioc->vtime_err = 0;
   2417			ioc->running = IOC_IDLE;
   2418		}
   2419
   2420		ioc_refresh_vrate(ioc, &now);
   2421	}
   2422
   2423	spin_unlock_irq(&ioc->lock);
   2424}
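
/*
 * Hysteresis sketch for the busy_level logic above, taking UNBUSY_THR_PCT
 * as 75 to match the ">25% margin" comment and ppm_rthr = 50000 (i.e. up
 * to 5% of reads may miss rlat) as an illustrative configuration: vrate is
 * slowed once missed_ppm[READ] exceeds 50000, and is sped up again only
 * when misses fall to 37500 or below, rq_wait_pct is under 75% of
 * RQ_WAIT_BUSY_PCT, and there are shortages without lagging IOs. Anywhere
 * in between, busy_level simply resets to 0.
 */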
   2425
   2426static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
   2427				      u64 abs_cost, struct ioc_now *now)
   2428{
   2429	struct ioc *ioc = iocg->ioc;
   2430	struct ioc_margins *margins = &ioc->margins;
   2431	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
   2432	u32 hwi, adj_step;
   2433	s64 margin;
   2434	u64 cost, new_inuse;
   2435
   2436	current_hweight(iocg, NULL, &hwi);
   2437	old_hwi = hwi;
   2438	cost = abs_cost_to_cost(abs_cost, hwi);
   2439	margin = now->vnow - vtime - cost;
   2440
   2441	/* debt handling owns inuse for debtors */
   2442	if (iocg->abs_vdebt)
   2443		return cost;
   2444
   2445	/*
   2446	 * We only increase inuse during period and do so if the margin has
   2447	 * deteriorated since the previous adjustment.
   2448	 */
   2449	if (margin >= iocg->saved_margin || margin >= margins->low ||
   2450	    iocg->inuse == iocg->active)
   2451		return cost;
   2452
   2453	spin_lock_irq(&ioc->lock);
   2454
   2455	/* we own inuse only when @iocg is in the normal active state */
   2456	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
   2457		spin_unlock_irq(&ioc->lock);
   2458		return cost;
   2459	}
   2460
   2461	/*
   2462	 * Bump up inuse till @abs_cost fits in the existing budget.
   2463	 * adj_step must be determined after acquiring ioc->lock - we might
    2464	 * have raced with another thread for activation and lost, and could
    2465	 * be reading iocg->active == 0 before taking ioc->lock, which would
    2466	 * lead to an infinite loop.
   2467	 */
   2468	new_inuse = iocg->inuse;
   2469	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
   2470	do {
   2471		new_inuse = new_inuse + adj_step;
   2472		propagate_weights(iocg, iocg->active, new_inuse, true, now);
   2473		current_hweight(iocg, NULL, &hwi);
   2474		cost = abs_cost_to_cost(abs_cost, hwi);
   2475	} while (time_after64(vtime + cost, now->vnow) &&
   2476		 iocg->inuse != iocg->active);
   2477
   2478	spin_unlock_irq(&ioc->lock);
   2479
   2480	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
   2481			old_inuse, iocg->inuse, old_hwi, hwi);
   2482
   2483	return cost;
   2484}
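
/*
 * Illustrative walk through the bump-up loop above, assuming an
 * INUSE_ADJ_STEP_PCT of 25 purely for the sake of example: with active =
 * 10000 and inuse = 2000, adj_step is 2500 and new_inuse is raised 2000 ->
 * 4500 -> 7000 -> ... Each step grows hweight_inuse, which shrinks the
 * converted cost, and the loop stops as soon as vtime + cost fits before
 * vnow or inuse reaches active.
 */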
   2485
   2486static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
   2487				    bool is_merge, u64 *costp)
   2488{
   2489	struct ioc *ioc = iocg->ioc;
   2490	u64 coef_seqio, coef_randio, coef_page;
   2491	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
   2492	u64 seek_pages = 0;
   2493	u64 cost = 0;
   2494
   2495	switch (bio_op(bio)) {
   2496	case REQ_OP_READ:
   2497		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
   2498		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
   2499		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
   2500		break;
   2501	case REQ_OP_WRITE:
   2502		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
   2503		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
   2504		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
   2505		break;
   2506	default:
   2507		goto out;
   2508	}
   2509
   2510	if (iocg->cursor) {
   2511		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
   2512		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
   2513	}
   2514
   2515	if (!is_merge) {
   2516		if (seek_pages > LCOEF_RANDIO_PAGES) {
   2517			cost += coef_randio;
   2518		} else {
   2519			cost += coef_seqio;
   2520		}
   2521	}
   2522	cost += pages * coef_page;
   2523out:
   2524	*costp = cost;
   2525}
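
/*
 * Cost model sketch with made-up geometry: assuming 512-byte sectors and
 * 4KiB pages (IOC_SECT_TO_PAGE_SHIFT == 3), a 64KiB read spans 16 pages.
 * Issued far from iocg->cursor (seek_pages > LCOEF_RANDIO_PAGES) it costs
 * coef_randio + 16 * coef_page; issued back-to-back with the previous IO
 * it costs coef_seqio + 16 * coef_page; and a merge into an existing
 * request pays only the 16 * coef_page size component.
 */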
   2526
   2527static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
   2528{
   2529	u64 cost;
   2530
   2531	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
   2532	return cost;
   2533}
   2534
   2535static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
   2536					 u64 *costp)
   2537{
   2538	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
   2539
   2540	switch (req_op(rq)) {
   2541	case REQ_OP_READ:
   2542		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
   2543		break;
   2544	case REQ_OP_WRITE:
   2545		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
   2546		break;
   2547	default:
   2548		*costp = 0;
   2549	}
   2550}
   2551
   2552static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
   2553{
   2554	u64 cost;
   2555
   2556	calc_size_vtime_cost_builtin(rq, ioc, &cost);
   2557	return cost;
   2558}
   2559
   2560static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
   2561{
   2562	struct blkcg_gq *blkg = bio->bi_blkg;
   2563	struct ioc *ioc = rqos_to_ioc(rqos);
   2564	struct ioc_gq *iocg = blkg_to_iocg(blkg);
   2565	struct ioc_now now;
   2566	struct iocg_wait wait;
   2567	u64 abs_cost, cost, vtime;
   2568	bool use_debt, ioc_locked;
   2569	unsigned long flags;
   2570
   2571	/* bypass IOs if disabled, still initializing, or for root cgroup */
   2572	if (!ioc->enabled || !iocg || !iocg->level)
   2573		return;
   2574
   2575	/* calculate the absolute vtime cost */
   2576	abs_cost = calc_vtime_cost(bio, iocg, false);
   2577	if (!abs_cost)
   2578		return;
   2579
   2580	if (!iocg_activate(iocg, &now))
   2581		return;
   2582
   2583	iocg->cursor = bio_end_sector(bio);
   2584	vtime = atomic64_read(&iocg->vtime);
   2585	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
   2586
   2587	/*
   2588	 * If no one's waiting and within budget, issue right away.  The
   2589	 * tests are racy but the races aren't systemic - we only miss once
   2590	 * in a while which is fine.
   2591	 */
   2592	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
   2593	    time_before_eq64(vtime + cost, now.vnow)) {
   2594		iocg_commit_bio(iocg, bio, abs_cost, cost);
   2595		return;
   2596	}
   2597
   2598	/*
   2599	 * We're over budget. This can be handled in two ways. IOs which may
   2600	 * cause priority inversions are punted to @ioc->aux_iocg and charged as
   2601	 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
   2602	 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
   2603	 * whether debt handling is needed and acquire locks accordingly.
   2604	 */
   2605	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
   2606	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
   2607retry_lock:
   2608	iocg_lock(iocg, ioc_locked, &flags);
   2609
   2610	/*
   2611	 * @iocg must stay activated for debt and waitq handling. Deactivation
   2612	 * is synchronized against both ioc->lock and waitq.lock and we won't
    2613	 * get deactivated as long as we're waiting or have debt, so we're good
   2614	 * if we're activated here. In the unlikely cases that we aren't, just
   2615	 * issue the IO.
   2616	 */
   2617	if (unlikely(list_empty(&iocg->active_list))) {
   2618		iocg_unlock(iocg, ioc_locked, &flags);
   2619		iocg_commit_bio(iocg, bio, abs_cost, cost);
   2620		return;
   2621	}
   2622
   2623	/*
   2624	 * We're over budget. If @bio has to be issued regardless, remember
   2625	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
   2626	 * off the debt before waking more IOs.
   2627	 *
   2628	 * This way, the debt is continuously paid off each period with the
   2629	 * actual budget available to the cgroup. If we just wound vtime, we
   2630	 * would incorrectly use the current hw_inuse for the entire amount
   2631	 * which, for example, can lead to the cgroup staying blocked for a
   2632	 * long time even with substantially raised hw_inuse.
   2633	 *
   2634	 * An iocg with vdebt should stay online so that the timer can keep
   2635	 * deducting its vdebt and [de]activate use_delay mechanism
   2636	 * accordingly. We don't want to race against the timer trying to
   2637	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
   2638	 * penalizing the cgroup and its descendants.
   2639	 */
   2640	if (use_debt) {
   2641		iocg_incur_debt(iocg, abs_cost, &now);
   2642		if (iocg_kick_delay(iocg, &now))
   2643			blkcg_schedule_throttle(rqos->q,
   2644					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
   2645		iocg_unlock(iocg, ioc_locked, &flags);
   2646		return;
   2647	}
   2648
   2649	/* guarantee that iocgs w/ waiters have maximum inuse */
   2650	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
   2651		if (!ioc_locked) {
   2652			iocg_unlock(iocg, false, &flags);
   2653			ioc_locked = true;
   2654			goto retry_lock;
   2655		}
   2656		propagate_weights(iocg, iocg->active, iocg->active, true,
   2657				  &now);
   2658	}
   2659
   2660	/*
   2661	 * Append self to the waitq and schedule the wakeup timer if we're
   2662	 * the first waiter.  The timer duration is calculated based on the
   2663	 * current vrate.  vtime and hweight changes can make it too short
   2664	 * or too long.  Each wait entry records the absolute cost it's
   2665	 * waiting for to allow re-evaluation using a custom wait entry.
   2666	 *
   2667	 * If too short, the timer simply reschedules itself.  If too long,
   2668	 * the period timer will notice and trigger wakeups.
   2669	 *
   2670	 * All waiters are on iocg->waitq and the wait states are
   2671	 * synchronized using waitq.lock.
   2672	 */
   2673	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
   2674	wait.wait.private = current;
   2675	wait.bio = bio;
   2676	wait.abs_cost = abs_cost;
   2677	wait.committed = false;	/* will be set true by waker */
   2678
   2679	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
   2680	iocg_kick_waitq(iocg, ioc_locked, &now);
   2681
   2682	iocg_unlock(iocg, ioc_locked, &flags);
   2683
   2684	while (true) {
   2685		set_current_state(TASK_UNINTERRUPTIBLE);
   2686		if (wait.committed)
   2687			break;
   2688		io_schedule();
   2689	}
   2690
   2691	/* waker already committed us, proceed */
   2692	finish_wait(&iocg->waitq, &wait.wait);
   2693}
   2694
   2695static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
   2696			   struct bio *bio)
   2697{
   2698	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
   2699	struct ioc *ioc = rqos_to_ioc(rqos);
   2700	sector_t bio_end = bio_end_sector(bio);
   2701	struct ioc_now now;
   2702	u64 vtime, abs_cost, cost;
   2703	unsigned long flags;
   2704
   2705	/* bypass if disabled, still initializing, or for root cgroup */
   2706	if (!ioc->enabled || !iocg || !iocg->level)
   2707		return;
   2708
   2709	abs_cost = calc_vtime_cost(bio, iocg, true);
   2710	if (!abs_cost)
   2711		return;
   2712
   2713	ioc_now(ioc, &now);
   2714
   2715	vtime = atomic64_read(&iocg->vtime);
   2716	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
   2717
   2718	/* update cursor if backmerging into the request at the cursor */
   2719	if (blk_rq_pos(rq) < bio_end &&
   2720	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
   2721		iocg->cursor = bio_end;
   2722
   2723	/*
   2724	 * Charge if there's enough vtime budget and the existing request has
   2725	 * cost assigned.
   2726	 */
   2727	if (rq->bio && rq->bio->bi_iocost_cost &&
   2728	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
   2729		iocg_commit_bio(iocg, bio, abs_cost, cost);
   2730		return;
   2731	}
   2732
   2733	/*
   2734	 * Otherwise, account it as debt if @iocg is online, which it should
   2735	 * be for the vast majority of cases. See debt handling in
   2736	 * ioc_rqos_throttle() for details.
   2737	 */
   2738	spin_lock_irqsave(&ioc->lock, flags);
   2739	spin_lock(&iocg->waitq.lock);
   2740
   2741	if (likely(!list_empty(&iocg->active_list))) {
   2742		iocg_incur_debt(iocg, abs_cost, &now);
   2743		if (iocg_kick_delay(iocg, &now))
   2744			blkcg_schedule_throttle(rqos->q,
   2745					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
   2746	} else {
   2747		iocg_commit_bio(iocg, bio, abs_cost, cost);
   2748	}
   2749
   2750	spin_unlock(&iocg->waitq.lock);
   2751	spin_unlock_irqrestore(&ioc->lock, flags);
   2752}
   2753
   2754static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
   2755{
   2756	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
   2757
   2758	if (iocg && bio->bi_iocost_cost)
   2759		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
   2760}
   2761
   2762static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
   2763{
   2764	struct ioc *ioc = rqos_to_ioc(rqos);
   2765	struct ioc_pcpu_stat *ccs;
   2766	u64 on_q_ns, rq_wait_ns, size_nsec;
   2767	int pidx, rw;
   2768
   2769	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
   2770		return;
   2771
   2772	switch (req_op(rq) & REQ_OP_MASK) {
   2773	case REQ_OP_READ:
   2774		pidx = QOS_RLAT;
   2775		rw = READ;
   2776		break;
   2777	case REQ_OP_WRITE:
   2778		pidx = QOS_WLAT;
   2779		rw = WRITE;
   2780		break;
   2781	default:
   2782		return;
   2783	}
   2784
   2785	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
   2786	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
   2787	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
   2788
   2789	ccs = get_cpu_ptr(ioc->pcpu_stat);
   2790
   2791	if (on_q_ns <= size_nsec ||
   2792	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
   2793		local_inc(&ccs->missed[rw].nr_met);
   2794	else
   2795		local_inc(&ccs->missed[rw].nr_missed);
   2796
   2797	local64_add(rq_wait_ns, &ccs->rq_wait_ns);
   2798
   2799	put_cpu_ptr(ccs);
   2800}
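
/*
 * Classification sketch (illustrative numbers): for a read completion with
 * qos[QOS_RLAT] = 5000us, the IO counts as met when its on-queue time
 * minus the size-proportional allowance (size_nsec) is within 5ms, and as
 * missed otherwise. rq_wait_ns accumulates the alloc-to-start wait, which
 * ioc_lat_stat() later turns into rq_wait_pct for the period timer.
 */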
   2801
   2802static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
   2803{
   2804	struct ioc *ioc = rqos_to_ioc(rqos);
   2805
   2806	spin_lock_irq(&ioc->lock);
   2807	ioc_refresh_params(ioc, false);
   2808	spin_unlock_irq(&ioc->lock);
   2809}
   2810
   2811static void ioc_rqos_exit(struct rq_qos *rqos)
   2812{
   2813	struct ioc *ioc = rqos_to_ioc(rqos);
   2814
   2815	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
   2816
   2817	spin_lock_irq(&ioc->lock);
   2818	ioc->running = IOC_STOP;
   2819	spin_unlock_irq(&ioc->lock);
   2820
   2821	del_timer_sync(&ioc->timer);
   2822	free_percpu(ioc->pcpu_stat);
   2823	kfree(ioc);
   2824}
   2825
   2826static struct rq_qos_ops ioc_rqos_ops = {
   2827	.throttle = ioc_rqos_throttle,
   2828	.merge = ioc_rqos_merge,
   2829	.done_bio = ioc_rqos_done_bio,
   2830	.done = ioc_rqos_done,
   2831	.queue_depth_changed = ioc_rqos_queue_depth_changed,
   2832	.exit = ioc_rqos_exit,
   2833};
   2834
   2835static int blk_iocost_init(struct request_queue *q)
   2836{
   2837	struct ioc *ioc;
   2838	struct rq_qos *rqos;
   2839	int i, cpu, ret;
   2840
   2841	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
   2842	if (!ioc)
   2843		return -ENOMEM;
   2844
   2845	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
   2846	if (!ioc->pcpu_stat) {
   2847		kfree(ioc);
   2848		return -ENOMEM;
   2849	}
   2850
   2851	for_each_possible_cpu(cpu) {
   2852		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
   2853
   2854		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
   2855			local_set(&ccs->missed[i].nr_met, 0);
   2856			local_set(&ccs->missed[i].nr_missed, 0);
   2857		}
   2858		local64_set(&ccs->rq_wait_ns, 0);
   2859	}
   2860
   2861	rqos = &ioc->rqos;
   2862	rqos->id = RQ_QOS_COST;
   2863	rqos->ops = &ioc_rqos_ops;
   2864	rqos->q = q;
   2865
   2866	spin_lock_init(&ioc->lock);
   2867	timer_setup(&ioc->timer, ioc_timer_fn, 0);
   2868	INIT_LIST_HEAD(&ioc->active_iocgs);
   2869
   2870	ioc->running = IOC_IDLE;
   2871	ioc->vtime_base_rate = VTIME_PER_USEC;
   2872	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
   2873	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
   2874	ioc->period_at = ktime_to_us(ktime_get());
   2875	atomic64_set(&ioc->cur_period, 0);
   2876	atomic_set(&ioc->hweight_gen, 0);
   2877
   2878	spin_lock_irq(&ioc->lock);
   2879	ioc->autop_idx = AUTOP_INVALID;
   2880	ioc_refresh_params(ioc, true);
   2881	spin_unlock_irq(&ioc->lock);
   2882
   2883	/*
   2884	 * rqos must be added before activation to allow iocg_pd_init() to
    2885	 * look up the ioc from q. This means that the rqos methods may get
    2886	 * called before policy activation completes, so they can't assume that
    2887	 * the target bio has an associated iocg and need to test for a NULL iocg.
   2888	 */
   2889	rq_qos_add(q, rqos);
   2890	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
   2891	if (ret) {
   2892		rq_qos_del(q, rqos);
   2893		free_percpu(ioc->pcpu_stat);
   2894		kfree(ioc);
   2895		return ret;
   2896	}
   2897	return 0;
   2898}
   2899
   2900static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
   2901{
   2902	struct ioc_cgrp *iocc;
   2903
   2904	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
   2905	if (!iocc)
   2906		return NULL;
   2907
   2908	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
   2909	return &iocc->cpd;
   2910}
   2911
   2912static void ioc_cpd_free(struct blkcg_policy_data *cpd)
   2913{
   2914	kfree(container_of(cpd, struct ioc_cgrp, cpd));
   2915}
   2916
   2917static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
   2918					     struct blkcg *blkcg)
   2919{
   2920	int levels = blkcg->css.cgroup->level + 1;
   2921	struct ioc_gq *iocg;
   2922
   2923	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
   2924	if (!iocg)
   2925		return NULL;
   2926
   2927	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
   2928	if (!iocg->pcpu_stat) {
   2929		kfree(iocg);
   2930		return NULL;
   2931	}
   2932
   2933	return &iocg->pd;
   2934}
   2935
   2936static void ioc_pd_init(struct blkg_policy_data *pd)
   2937{
   2938	struct ioc_gq *iocg = pd_to_iocg(pd);
   2939	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
   2940	struct ioc *ioc = q_to_ioc(blkg->q);
   2941	struct ioc_now now;
   2942	struct blkcg_gq *tblkg;
   2943	unsigned long flags;
   2944
   2945	ioc_now(ioc, &now);
   2946
   2947	iocg->ioc = ioc;
   2948	atomic64_set(&iocg->vtime, now.vnow);
   2949	atomic64_set(&iocg->done_vtime, now.vnow);
   2950	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
   2951	INIT_LIST_HEAD(&iocg->active_list);
   2952	INIT_LIST_HEAD(&iocg->walk_list);
   2953	INIT_LIST_HEAD(&iocg->surplus_list);
   2954	iocg->hweight_active = WEIGHT_ONE;
   2955	iocg->hweight_inuse = WEIGHT_ONE;
   2956
   2957	init_waitqueue_head(&iocg->waitq);
   2958	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
   2959	iocg->waitq_timer.function = iocg_waitq_timer_fn;
   2960
   2961	iocg->level = blkg->blkcg->css.cgroup->level;
   2962
   2963	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
   2964		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
   2965		iocg->ancestors[tiocg->level] = tiocg;
   2966	}
   2967
   2968	spin_lock_irqsave(&ioc->lock, flags);
   2969	weight_updated(iocg, &now);
   2970	spin_unlock_irqrestore(&ioc->lock, flags);
   2971}
   2972
   2973static void ioc_pd_free(struct blkg_policy_data *pd)
   2974{
   2975	struct ioc_gq *iocg = pd_to_iocg(pd);
   2976	struct ioc *ioc = iocg->ioc;
   2977	unsigned long flags;
   2978
   2979	if (ioc) {
   2980		spin_lock_irqsave(&ioc->lock, flags);
   2981
   2982		if (!list_empty(&iocg->active_list)) {
   2983			struct ioc_now now;
   2984
   2985			ioc_now(ioc, &now);
   2986			propagate_weights(iocg, 0, 0, false, &now);
   2987			list_del_init(&iocg->active_list);
   2988		}
   2989
   2990		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
   2991		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
   2992
   2993		spin_unlock_irqrestore(&ioc->lock, flags);
   2994
   2995		hrtimer_cancel(&iocg->waitq_timer);
   2996	}
   2997	free_percpu(iocg->pcpu_stat);
   2998	kfree(iocg);
   2999}
   3000
   3001static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
   3002{
   3003	struct ioc_gq *iocg = pd_to_iocg(pd);
   3004	struct ioc *ioc = iocg->ioc;
   3005
   3006	if (!ioc->enabled)
   3007		return;
   3008
   3009	if (iocg->level == 0) {
   3010		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
   3011			ioc->vtime_base_rate * 10000,
   3012			VTIME_PER_USEC);
   3013		seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
   3014	}
   3015
   3016	seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
   3017
   3018	if (blkcg_debug_stats)
   3019		seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
   3020			iocg->last_stat.wait_us,
   3021			iocg->last_stat.indebt_us,
   3022			iocg->last_stat.indelay_us);
   3023}
   3024
   3025static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
   3026			     int off)
   3027{
   3028	const char *dname = blkg_dev_name(pd->blkg);
   3029	struct ioc_gq *iocg = pd_to_iocg(pd);
   3030
   3031	if (dname && iocg->cfg_weight)
   3032		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
   3033	return 0;
   3034}
   3035
   3036
   3037static int ioc_weight_show(struct seq_file *sf, void *v)
   3038{
   3039	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
   3040	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
   3041
   3042	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
   3043	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
   3044			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
   3045	return 0;
   3046}
   3047
   3048static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
   3049				size_t nbytes, loff_t off)
   3050{
   3051	struct blkcg *blkcg = css_to_blkcg(of_css(of));
   3052	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
   3053	struct blkg_conf_ctx ctx;
   3054	struct ioc_now now;
   3055	struct ioc_gq *iocg;
   3056	u32 v;
   3057	int ret;
   3058
   3059	if (!strchr(buf, ':')) {
   3060		struct blkcg_gq *blkg;
   3061
   3062		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
   3063			return -EINVAL;
   3064
   3065		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
   3066			return -EINVAL;
   3067
   3068		spin_lock_irq(&blkcg->lock);
   3069		iocc->dfl_weight = v * WEIGHT_ONE;
   3070		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
   3071			struct ioc_gq *iocg = blkg_to_iocg(blkg);
   3072
   3073			if (iocg) {
   3074				spin_lock(&iocg->ioc->lock);
   3075				ioc_now(iocg->ioc, &now);
   3076				weight_updated(iocg, &now);
   3077				spin_unlock(&iocg->ioc->lock);
   3078			}
   3079		}
   3080		spin_unlock_irq(&blkcg->lock);
   3081
   3082		return nbytes;
   3083	}
   3084
   3085	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
   3086	if (ret)
   3087		return ret;
   3088
   3089	iocg = blkg_to_iocg(ctx.blkg);
   3090
   3091	if (!strncmp(ctx.body, "default", 7)) {
   3092		v = 0;
   3093	} else {
   3094		if (!sscanf(ctx.body, "%u", &v))
   3095			goto einval;
   3096		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
   3097			goto einval;
   3098	}
   3099
   3100	spin_lock(&iocg->ioc->lock);
   3101	iocg->cfg_weight = v * WEIGHT_ONE;
   3102	ioc_now(iocg->ioc, &now);
   3103	weight_updated(iocg, &now);
   3104	spin_unlock(&iocg->ioc->lock);
   3105
   3106	blkg_conf_finish(&ctx);
   3107	return nbytes;
   3108
   3109einval:
   3110	blkg_conf_finish(&ctx);
   3111	return -EINVAL;
   3112}
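
/*
 * Usage sketch for the weight interface above (device numbers are made
 * up): writing "default 150" or just "150" to the cgroup's io.weight sets
 * the per-cgroup default (stored as 150 * WEIGHT_ONE), "8:16 300" sets a
 * per-device override, and "8:16 default" clears the override by setting
 * cfg_weight back to 0. Values must fall within CGROUP_WEIGHT_MIN and
 * CGROUP_WEIGHT_MAX.
 */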
   3113
   3114static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
   3115			  int off)
   3116{
   3117	const char *dname = blkg_dev_name(pd->blkg);
   3118	struct ioc *ioc = pd_to_iocg(pd)->ioc;
   3119
   3120	if (!dname)
   3121		return 0;
   3122
   3123	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
   3124		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
   3125		   ioc->params.qos[QOS_RPPM] / 10000,
   3126		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
   3127		   ioc->params.qos[QOS_RLAT],
   3128		   ioc->params.qos[QOS_WPPM] / 10000,
   3129		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
   3130		   ioc->params.qos[QOS_WLAT],
   3131		   ioc->params.qos[QOS_MIN] / 10000,
   3132		   ioc->params.qos[QOS_MIN] % 10000 / 100,
   3133		   ioc->params.qos[QOS_MAX] / 10000,
   3134		   ioc->params.qos[QOS_MAX] % 10000 / 100);
   3135	return 0;
   3136}
   3137
   3138static int ioc_qos_show(struct seq_file *sf, void *v)
   3139{
   3140	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
   3141
   3142	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
   3143			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
   3144	return 0;
   3145}
   3146
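/*
 * Token tables for parsing "io.cost.qos" writes: the control tokens
 * (enable=, ctrl=) are matched first, the value tokens carry the QoS
 * parameters themselves.  Percentage-style values are matched as strings
 * and converted with cgroup_parse_float().
 */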
   3147static const match_table_t qos_ctrl_tokens = {
   3148	{ QOS_ENABLE,		"enable=%u"	},
   3149	{ QOS_CTRL,		"ctrl=%s"	},
   3150	{ NR_QOS_CTRL_PARAMS,	NULL		},
   3151};
   3152
   3153static const match_table_t qos_tokens = {
   3154	{ QOS_RPPM,		"rpct=%s"	},
   3155	{ QOS_RLAT,		"rlat=%u"	},
   3156	{ QOS_WPPM,		"wpct=%s"	},
   3157	{ QOS_WLAT,		"wlat=%u"	},
   3158	{ QOS_MIN,		"min=%s"	},
   3159	{ QOS_MAX,		"max=%s"	},
   3160	{ NR_QOS_PARAMS,	NULL		},
   3161};
   3162
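/*
 * Example "io.cost.qos" write matching the tokens above (device number and
 * values are illustrative; omitted parameters keep their current values):
 *
 *   echo "8:16 enable=1 rpct=95.00 rlat=5000" > io.cost.qos
 */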
   3163static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
   3164			     size_t nbytes, loff_t off)
   3165{
   3166	struct block_device *bdev;
   3167	struct ioc *ioc;
   3168	u32 qos[NR_QOS_PARAMS];
   3169	bool enable, user;
   3170	char *p;
   3171	int ret;
   3172
   3173	bdev = blkcg_conf_open_bdev(&input);
   3174	if (IS_ERR(bdev))
   3175		return PTR_ERR(bdev);
   3176
   3177	ioc = q_to_ioc(bdev_get_queue(bdev));
   3178	if (!ioc) {
   3179		ret = blk_iocost_init(bdev_get_queue(bdev));
   3180		if (ret)
   3181			goto err;
   3182		ioc = q_to_ioc(bdev_get_queue(bdev));
   3183	}
   3184
   3185	spin_lock_irq(&ioc->lock);
   3186	memcpy(qos, ioc->params.qos, sizeof(qos));
   3187	enable = ioc->enabled;
   3188	user = ioc->user_qos_params;
   3189	spin_unlock_irq(&ioc->lock);
   3190
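	/*
	 * Parse the write token by token.  enable= and ctrl= only flip the
	 * control flags; specifying any concrete QoS parameter implicitly
	 * switches the device to user-defined parameters.
	 */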
   3191	while ((p = strsep(&input, " \t\n"))) {
   3192		substring_t args[MAX_OPT_ARGS];
   3193		char buf[32];
   3194		int tok;
   3195		s64 v;
   3196
   3197		if (!*p)
   3198			continue;
   3199
   3200		switch (match_token(p, qos_ctrl_tokens, args)) {
   3201		case QOS_ENABLE:
   3202			match_u64(&args[0], &v);
   3203			enable = v;
   3204			continue;
   3205		case QOS_CTRL:
   3206			match_strlcpy(buf, &args[0], sizeof(buf));
   3207			if (!strcmp(buf, "auto"))
   3208				user = false;
   3209			else if (!strcmp(buf, "user"))
   3210				user = true;
   3211			else
   3212				goto einval;
   3213			continue;
   3214		}
   3215
   3216		tok = match_token(p, qos_tokens, args);
   3217		switch (tok) {
   3218		case QOS_RPPM:
   3219		case QOS_WPPM:
   3220			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
   3221			    sizeof(buf))
   3222				goto einval;
   3223			if (cgroup_parse_float(buf, 2, &v))
   3224				goto einval;
   3225			if (v < 0 || v > 10000)
   3226				goto einval;
   3227			qos[tok] = v * 100;
   3228			break;
   3229		case QOS_RLAT:
   3230		case QOS_WLAT:
   3231			if (match_u64(&args[0], &v))
   3232				goto einval;
   3233			qos[tok] = v;
   3234			break;
   3235		case QOS_MIN:
   3236		case QOS_MAX:
   3237			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
   3238			    sizeof(buf))
   3239				goto einval;
   3240			if (cgroup_parse_float(buf, 2, &v))
   3241				goto einval;
   3242			if (v < 0)
   3243				goto einval;
   3244			qos[tok] = clamp_t(s64, v * 100,
   3245					   VRATE_MIN_PPM, VRATE_MAX_PPM);
   3246			break;
   3247		default:
   3248			goto einval;
   3249		}
   3250		user = true;
   3251	}
   3252
   3253	if (qos[QOS_MIN] > qos[QOS_MAX])
   3254		goto einval;
   3255
   3256	spin_lock_irq(&ioc->lock);
   3257
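	/*
	 * Enabling iocost needs blk-stat accounting and per-request
	 * allocation timestamps on the queue; disabling only clears the
	 * alloc-time flag and ->enabled.
	 */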
   3258	if (enable) {
   3259		blk_stat_enable_accounting(ioc->rqos.q);
   3260		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
   3261		ioc->enabled = true;
   3262	} else {
   3263		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
   3264		ioc->enabled = false;
   3265	}
   3266
   3267	if (user) {
   3268		memcpy(ioc->params.qos, qos, sizeof(qos));
   3269		ioc->user_qos_params = true;
   3270	} else {
   3271		ioc->user_qos_params = false;
   3272	}
   3273
   3274	ioc_refresh_params(ioc, true);
   3275	spin_unlock_irq(&ioc->lock);
   3276
   3277	blkdev_put_no_open(bdev);
   3278	return nbytes;
   3279einval:
   3280	ret = -EINVAL;
   3281err:
   3282	blkdev_put_no_open(bdev);
   3283	return ret;
   3284}
   3285
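/*
 * "io.cost.model" read and write side.  Only the linear cost model is
 * implemented; its coefficients (bytes per second plus sequential and
 * random IOs per second, separately for reads and writes) are printed
 * here and parsed via the token tables further below.
 */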
   3286static u64 ioc_cost_model_prfill(struct seq_file *sf,
   3287				 struct blkg_policy_data *pd, int off)
   3288{
   3289	const char *dname = blkg_dev_name(pd->blkg);
   3290	struct ioc *ioc = pd_to_iocg(pd)->ioc;
   3291	u64 *u = ioc->params.i_lcoefs;
   3292
   3293	if (!dname)
   3294		return 0;
   3295
   3296	seq_printf(sf, "%s ctrl=%s model=linear "
   3297		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
   3298		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
   3299		   dname, ioc->user_cost_model ? "user" : "auto",
   3300		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
   3301		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
   3302	return 0;
   3303}
   3304
   3305static int ioc_cost_model_show(struct seq_file *sf, void *v)
   3306{
   3307	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
   3308
   3309	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
   3310			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
   3311	return 0;
   3312}
   3313
   3314static const match_table_t cost_ctrl_tokens = {
   3315	{ COST_CTRL,		"ctrl=%s"	},
   3316	{ COST_MODEL,		"model=%s"	},
   3317	{ NR_COST_CTRL_PARAMS,	NULL		},
   3318};
   3319
   3320static const match_table_t i_lcoef_tokens = {
   3321	{ I_LCOEF_RBPS,		"rbps=%u"	},
   3322	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
   3323	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
   3324	{ I_LCOEF_WBPS,		"wbps=%u"	},
   3325	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
   3326	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
   3327	{ NR_I_LCOEFS,		NULL		},
   3328};
   3329
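/*
 * Example "io.cost.model" write matching the tokens above (device number
 * and coefficients are purely illustrative; unspecified coefficients keep
 * their current values):
 *
 *   echo "8:16 rbps=500000000 wbps=400000000" > io.cost.model
 */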
   3330static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
   3331				    size_t nbytes, loff_t off)
   3332{
   3333	struct block_device *bdev;
   3334	struct ioc *ioc;
   3335	u64 u[NR_I_LCOEFS];
   3336	bool user;
   3337	char *p;
   3338	int ret;
   3339
   3340	bdev = blkcg_conf_open_bdev(&input);
   3341	if (IS_ERR(bdev))
   3342		return PTR_ERR(bdev);
   3343
   3344	ioc = q_to_ioc(bdev_get_queue(bdev));
   3345	if (!ioc) {
   3346		ret = blk_iocost_init(bdev_get_queue(bdev));
   3347		if (ret)
   3348			goto err;
   3349		ioc = q_to_ioc(bdev_get_queue(bdev));
   3350	}
   3351
   3352	spin_lock_irq(&ioc->lock);
   3353	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
   3354	user = ioc->user_cost_model;
   3355	spin_unlock_irq(&ioc->lock);
   3356
   3357	while ((p = strsep(&input, " \t\n"))) {
   3358		substring_t args[MAX_OPT_ARGS];
   3359		char buf[32];
   3360		int tok;
   3361		u64 v;
   3362
   3363		if (!*p)
   3364			continue;
   3365
   3366		switch (match_token(p, cost_ctrl_tokens, args)) {
   3367		case COST_CTRL:
   3368			match_strlcpy(buf, &args[0], sizeof(buf));
   3369			if (!strcmp(buf, "auto"))
   3370				user = false;
   3371			else if (!strcmp(buf, "user"))
   3372				user = true;
   3373			else
   3374				goto einval;
   3375			continue;
   3376		case COST_MODEL:
   3377			match_strlcpy(buf, &args[0], sizeof(buf));
   3378			if (strcmp(buf, "linear"))
   3379				goto einval;
   3380			continue;
   3381		}
   3382
   3383		tok = match_token(p, i_lcoef_tokens, args);
   3384		if (tok == NR_I_LCOEFS)
   3385			goto einval;
   3386		if (match_u64(&args[0], &v))
   3387			goto einval;
   3388		u[tok] = v;
   3389		user = true;
   3390	}
   3391
   3392	spin_lock_irq(&ioc->lock);
   3393	if (user) {
   3394		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
   3395		ioc->user_cost_model = true;
   3396	} else {
   3397		ioc->user_cost_model = false;
   3398	}
   3399	ioc_refresh_params(ioc, true);
   3400	spin_unlock_irq(&ioc->lock);
   3401
   3402	blkdev_put_no_open(bdev);
   3403	return nbytes;
   3404
   3405einval:
   3406	ret = -EINVAL;
   3407err:
   3408	blkdev_put_no_open(bdev);
   3409	return ret;
   3410}
   3411
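/*
 * cgroup files exported by the controller: "io.weight" on every cgroup
 * except the root, "io.cost.qos" and "io.cost.model" on the root cgroup
 * only, since those two configure the device rather than a single group.
 */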
   3412static struct cftype ioc_files[] = {
   3413	{
   3414		.name = "weight",
   3415		.flags = CFTYPE_NOT_ON_ROOT,
   3416		.seq_show = ioc_weight_show,
   3417		.write = ioc_weight_write,
   3418	},
   3419	{
   3420		.name = "cost.qos",
   3421		.flags = CFTYPE_ONLY_ON_ROOT,
   3422		.seq_show = ioc_qos_show,
   3423		.write = ioc_qos_write,
   3424	},
   3425	{
   3426		.name = "cost.model",
   3427		.flags = CFTYPE_ONLY_ON_ROOT,
   3428		.seq_show = ioc_cost_model_show,
   3429		.write = ioc_cost_model_write,
   3430	},
   3431	{}
   3432};
   3433
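/*
 * blkcg policy descriptor tying together the per-blkcg (cpd) and
 * per-cgroup-per-device (pd) alloc/init/free/stat callbacks with the
 * cgroup files above.
 */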
   3434static struct blkcg_policy blkcg_policy_iocost = {
   3435	.dfl_cftypes	= ioc_files,
   3436	.cpd_alloc_fn	= ioc_cpd_alloc,
   3437	.cpd_free_fn	= ioc_cpd_free,
   3438	.pd_alloc_fn	= ioc_pd_alloc,
   3439	.pd_init_fn	= ioc_pd_init,
   3440	.pd_free_fn	= ioc_pd_free,
   3441	.pd_stat_fn	= ioc_pd_stat,
   3442};
   3443
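/* Module init/exit just (un)register the policy with the blkcg core. */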
   3444static int __init ioc_init(void)
   3445{
   3446	return blkcg_policy_register(&blkcg_policy_iocost);
   3447}
   3448
   3449static void __exit ioc_exit(void)
   3450{
   3451	blkcg_policy_unregister(&blkcg_policy_iocost);
   3452}
   3453
   3454module_init(ioc_init);
   3455module_exit(ioc_exit);