cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

page-writeback.c (94027B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * mm/page-writeback.c
      4 *
      5 * Copyright (C) 2002, Linus Torvalds.
      6 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
      7 *
      8 * Contains functions related to writing back dirty pages at the
      9 * address_space level.
     10 *
     11 * 10Apr2002	Andrew Morton
     12 *		Initial version
     13 */
     14
     15#include <linux/kernel.h>
     16#include <linux/export.h>
     17#include <linux/spinlock.h>
     18#include <linux/fs.h>
     19#include <linux/mm.h>
     20#include <linux/swap.h>
     21#include <linux/slab.h>
     22#include <linux/pagemap.h>
     23#include <linux/writeback.h>
     24#include <linux/init.h>
     25#include <linux/backing-dev.h>
     26#include <linux/task_io_accounting_ops.h>
     27#include <linux/blkdev.h>
     28#include <linux/mpage.h>
     29#include <linux/rmap.h>
     30#include <linux/percpu.h>
     31#include <linux/smp.h>
     32#include <linux/sysctl.h>
     33#include <linux/cpu.h>
     34#include <linux/syscalls.h>
     35#include <linux/pagevec.h>
     36#include <linux/timer.h>
     37#include <linux/sched/rt.h>
     38#include <linux/sched/signal.h>
     39#include <linux/mm_inline.h>
     40#include <trace/events/writeback.h>
     41
     42#include "internal.h"
     43
     44/*
     45 * Sleep at most 200ms at a time in balance_dirty_pages().
     46 */
     47#define MAX_PAUSE		max(HZ/5, 1)
     48
     49/*
     50 * Try to keep balance_dirty_pages() call intervals higher than this many pages
      51 * by raising pause time to max_pause when it falls below it.
     52 */
     53#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))
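/*
 * For example, with 4 KiB pages (PAGE_SHIFT == 12) this works out to
 * 128 >> 2 = 32 pages.
 */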
     54
     55/*
     56 * Estimate write bandwidth at 200ms intervals.
     57 */
     58#define BANDWIDTH_INTERVAL	max(HZ/5, 1)
     59
     60#define RATELIMIT_CALC_SHIFT	10
     61
     62/*
     63 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
     64 * will look to see if it needs to force writeback or throttling.
     65 */
     66static long ratelimit_pages = 32;
     67
     68/* The following parameters are exported via /proc/sys/vm */
     69
     70/*
     71 * Start background writeback (via writeback threads) at this percentage
     72 */
     73static int dirty_background_ratio = 10;
     74
     75/*
     76 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
     77 * dirty_background_ratio * the amount of dirtyable memory
     78 */
     79static unsigned long dirty_background_bytes;
     80
     81/*
     82 * free highmem will not be subtracted from the total free memory
     83 * for calculating free ratios if vm_highmem_is_dirtyable is true
     84 */
     85static int vm_highmem_is_dirtyable;
     86
     87/*
     88 * The generator of dirty data starts writeback at this percentage
     89 */
     90static int vm_dirty_ratio = 20;
     91
     92/*
     93 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
     94 * vm_dirty_ratio * the amount of dirtyable memory
     95 */
     96static unsigned long vm_dirty_bytes;
     97
     98/*
     99 * The interval between `kupdate'-style writebacks
    100 */
    101unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
    102
    103EXPORT_SYMBOL_GPL(dirty_writeback_interval);
    104
    105/*
    106 * The longest time for which data is allowed to remain dirty
    107 */
    108unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
    109
    110/*
    111 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
    112 * a full sync is triggered after this time elapses without any disk activity.
    113 */
    114int laptop_mode;
    115
    116EXPORT_SYMBOL(laptop_mode);
    117
    118/* End of sysctl-exported parameters */
    119
    120struct wb_domain global_wb_domain;
    121
    122/* consolidated parameters for balance_dirty_pages() and its subroutines */
    123struct dirty_throttle_control {
    124#ifdef CONFIG_CGROUP_WRITEBACK
    125	struct wb_domain	*dom;
    126	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
    127#endif
    128	struct bdi_writeback	*wb;
    129	struct fprop_local_percpu *wb_completions;
    130
    131	unsigned long		avail;		/* dirtyable */
    132	unsigned long		dirty;		/* file_dirty + write + nfs */
    133	unsigned long		thresh;		/* dirty threshold */
    134	unsigned long		bg_thresh;	/* dirty background threshold */
    135
    136	unsigned long		wb_dirty;	/* per-wb counterparts */
    137	unsigned long		wb_thresh;
    138	unsigned long		wb_bg_thresh;
    139
    140	unsigned long		pos_ratio;
    141};
    142
    143/*
    144 * Length of period for aging writeout fractions of bdis. This is an
    145 * arbitrarily chosen number. The longer the period, the slower fractions will
    146 * reflect changes in current writeout rate.
    147 */
    148#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
    149
    150#ifdef CONFIG_CGROUP_WRITEBACK
    151
    152#define GDTC_INIT(__wb)		.wb = (__wb),				\
    153				.dom = &global_wb_domain,		\
    154				.wb_completions = &(__wb)->completions
    155
    156#define GDTC_INIT_NO_WB		.dom = &global_wb_domain
    157
    158#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
    159				.dom = mem_cgroup_wb_domain(__wb),	\
    160				.wb_completions = &(__wb)->memcg_completions, \
    161				.gdtc = __gdtc
    162
    163static bool mdtc_valid(struct dirty_throttle_control *dtc)
    164{
    165	return dtc->dom;
    166}
    167
    168static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
    169{
    170	return dtc->dom;
    171}
    172
    173static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
    174{
    175	return mdtc->gdtc;
    176}
    177
    178static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
    179{
    180	return &wb->memcg_completions;
    181}
    182
    183static void wb_min_max_ratio(struct bdi_writeback *wb,
    184			     unsigned long *minp, unsigned long *maxp)
    185{
    186	unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
    187	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
    188	unsigned long long min = wb->bdi->min_ratio;
    189	unsigned long long max = wb->bdi->max_ratio;
    190
    191	/*
    192	 * @wb may already be clean by the time control reaches here and
    193	 * the total may not include its bw.
    194	 */
    195	if (this_bw < tot_bw) {
    196		if (min) {
    197			min *= this_bw;
    198			min = div64_ul(min, tot_bw);
    199		}
    200		if (max < 100) {
    201			max *= this_bw;
    202			max = div64_ul(max, tot_bw);
    203		}
    204	}
    205
    206	*minp = min;
    207	*maxp = max;
    208}
    209
    210#else	/* CONFIG_CGROUP_WRITEBACK */
    211
    212#define GDTC_INIT(__wb)		.wb = (__wb),                           \
    213				.wb_completions = &(__wb)->completions
    214#define GDTC_INIT_NO_WB
    215#define MDTC_INIT(__wb, __gdtc)
    216
    217static bool mdtc_valid(struct dirty_throttle_control *dtc)
    218{
    219	return false;
    220}
    221
    222static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
    223{
    224	return &global_wb_domain;
    225}
    226
    227static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
    228{
    229	return NULL;
    230}
    231
    232static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
    233{
    234	return NULL;
    235}
    236
    237static void wb_min_max_ratio(struct bdi_writeback *wb,
    238			     unsigned long *minp, unsigned long *maxp)
    239{
    240	*minp = wb->bdi->min_ratio;
    241	*maxp = wb->bdi->max_ratio;
    242}
    243
    244#endif	/* CONFIG_CGROUP_WRITEBACK */
    245
    246/*
    247 * In a memory zone, there is a certain amount of pages we consider
    248 * available for the page cache, which is essentially the number of
    249 * free and reclaimable pages, minus some zone reserves to protect
    250 * lowmem and the ability to uphold the zone's watermarks without
    251 * requiring writeback.
    252 *
    253 * This number of dirtyable pages is the base value of which the
    254 * user-configurable dirty ratio is the effective number of pages that
    255 * are allowed to be actually dirtied.  Per individual zone, or
    256 * globally by using the sum of dirtyable pages over all zones.
    257 *
    258 * Because the user is allowed to specify the dirty limit globally as
    259 * absolute number of bytes, calculating the per-zone dirty limit can
    260 * require translating the configured limit into a percentage of
    261 * global dirtyable memory first.
    262 */
    263
    264/**
    265 * node_dirtyable_memory - number of dirtyable pages in a node
    266 * @pgdat: the node
    267 *
    268 * Return: the node's number of pages potentially available for dirty
    269 * page cache.  This is the base value for the per-node dirty limits.
    270 */
    271static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
    272{
    273	unsigned long nr_pages = 0;
    274	int z;
    275
    276	for (z = 0; z < MAX_NR_ZONES; z++) {
    277		struct zone *zone = pgdat->node_zones + z;
    278
    279		if (!populated_zone(zone))
    280			continue;
    281
    282		nr_pages += zone_page_state(zone, NR_FREE_PAGES);
    283	}
    284
    285	/*
    286	 * Pages reserved for the kernel should not be considered
    287	 * dirtyable, to prevent a situation where reclaim has to
    288	 * clean pages in order to balance the zones.
    289	 */
    290	nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
    291
    292	nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
    293	nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
    294
    295	return nr_pages;
    296}
    297
    298static unsigned long highmem_dirtyable_memory(unsigned long total)
    299{
    300#ifdef CONFIG_HIGHMEM
    301	int node;
    302	unsigned long x = 0;
    303	int i;
    304
    305	for_each_node_state(node, N_HIGH_MEMORY) {
    306		for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
    307			struct zone *z;
    308			unsigned long nr_pages;
    309
    310			if (!is_highmem_idx(i))
    311				continue;
    312
    313			z = &NODE_DATA(node)->node_zones[i];
    314			if (!populated_zone(z))
    315				continue;
    316
    317			nr_pages = zone_page_state(z, NR_FREE_PAGES);
    318			/* watch for underflows */
    319			nr_pages -= min(nr_pages, high_wmark_pages(z));
    320			nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
    321			nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
    322			x += nr_pages;
    323		}
    324	}
    325
    326	/*
     327	 * Make sure that the number of highmem pages is never larger
     328	 * than the total amount of dirtyable memory. This can only
     329	 * occur in very strange VM situations, but we want to make
     330	 * sure that it does not.
    331	 */
    332	return min(x, total);
    333#else
    334	return 0;
    335#endif
    336}
    337
    338/**
    339 * global_dirtyable_memory - number of globally dirtyable pages
    340 *
    341 * Return: the global number of pages potentially available for dirty
    342 * page cache.  This is the base value for the global dirty limits.
    343 */
    344static unsigned long global_dirtyable_memory(void)
    345{
    346	unsigned long x;
    347
    348	x = global_zone_page_state(NR_FREE_PAGES);
    349	/*
    350	 * Pages reserved for the kernel should not be considered
    351	 * dirtyable, to prevent a situation where reclaim has to
    352	 * clean pages in order to balance the zones.
    353	 */
    354	x -= min(x, totalreserve_pages);
    355
    356	x += global_node_page_state(NR_INACTIVE_FILE);
    357	x += global_node_page_state(NR_ACTIVE_FILE);
    358
    359	if (!vm_highmem_is_dirtyable)
    360		x -= highmem_dirtyable_memory(x);
    361
    362	return x + 1;	/* Ensure that we never return 0 */
    363}
    364
    365/**
    366 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
    367 * @dtc: dirty_throttle_control of interest
    368 *
    369 * Calculate @dtc->thresh and ->bg_thresh considering
    370 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
    371 * must ensure that @dtc->avail is set before calling this function.  The
    372 * dirty limits will be lifted by 1/4 for real-time tasks.
    373 */
    374static void domain_dirty_limits(struct dirty_throttle_control *dtc)
    375{
    376	const unsigned long available_memory = dtc->avail;
    377	struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
    378	unsigned long bytes = vm_dirty_bytes;
    379	unsigned long bg_bytes = dirty_background_bytes;
    380	/* convert ratios to per-PAGE_SIZE for higher precision */
    381	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
    382	unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
    383	unsigned long thresh;
    384	unsigned long bg_thresh;
    385	struct task_struct *tsk;
    386
    387	/* gdtc is !NULL iff @dtc is for memcg domain */
    388	if (gdtc) {
    389		unsigned long global_avail = gdtc->avail;
    390
    391		/*
    392		 * The byte settings can't be applied directly to memcg
    393		 * domains.  Convert them to ratios by scaling against
    394		 * globally available memory.  As the ratios are in
    395		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
    396		 * number of pages.
    397		 */
    398		if (bytes)
    399			ratio = min(DIV_ROUND_UP(bytes, global_avail),
    400				    PAGE_SIZE);
    401		if (bg_bytes)
    402			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
    403				       PAGE_SIZE);
    404		bytes = bg_bytes = 0;
    405	}
    406
    407	if (bytes)
    408		thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
    409	else
    410		thresh = (ratio * available_memory) / PAGE_SIZE;
    411
    412	if (bg_bytes)
    413		bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
    414	else
    415		bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
    416
    417	if (bg_thresh >= thresh)
    418		bg_thresh = thresh / 2;
    419	tsk = current;
    420	if (rt_task(tsk)) {
    421		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
    422		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
    423	}
    424	dtc->thresh = thresh;
    425	dtc->bg_thresh = bg_thresh;
    426
    427	/* we should eventually report the domain in the TP */
    428	if (!gdtc)
    429		trace_global_dirty_state(bg_thresh, thresh);
    430}
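/*
 * Worked example (illustrative numbers only): with 4 GiB of dirtyable
 * memory and 4 KiB pages, @dtc->avail is 1,048,576 pages.  Assuming the
 * defaults vm_dirty_ratio = 20 and dirty_background_ratio = 10:
 *
 *	ratio     = 20 * 4096 / 100 = 819  (per-PAGE_SIZE units)
 *	bg_ratio  = 10 * 4096 / 100 = 409
 *	thresh    = 819 * 1048576 / 4096 = 209,664 pages (~819 MiB)
 *	bg_thresh = 409 * 1048576 / 4096 = 104,704 pages (~409 MiB)
 *
 * bg_thresh < thresh, so no halving takes place; a real-time task
 * additionally gets a quarter of each threshold plus
 * global_wb_domain.dirty_limit / 32 added on top.
 */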
    431
    432/**
    433 * global_dirty_limits - background-writeback and dirty-throttling thresholds
    434 * @pbackground: out parameter for bg_thresh
    435 * @pdirty: out parameter for thresh
    436 *
    437 * Calculate bg_thresh and thresh for global_wb_domain.  See
    438 * domain_dirty_limits() for details.
    439 */
    440void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
    441{
    442	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
    443
    444	gdtc.avail = global_dirtyable_memory();
    445	domain_dirty_limits(&gdtc);
    446
    447	*pbackground = gdtc.bg_thresh;
    448	*pdirty = gdtc.thresh;
    449}
    450
    451/**
    452 * node_dirty_limit - maximum number of dirty pages allowed in a node
    453 * @pgdat: the node
    454 *
    455 * Return: the maximum number of dirty pages allowed in a node, based
    456 * on the node's dirtyable memory.
    457 */
    458static unsigned long node_dirty_limit(struct pglist_data *pgdat)
    459{
    460	unsigned long node_memory = node_dirtyable_memory(pgdat);
    461	struct task_struct *tsk = current;
    462	unsigned long dirty;
    463
    464	if (vm_dirty_bytes)
    465		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
    466			node_memory / global_dirtyable_memory();
    467	else
    468		dirty = vm_dirty_ratio * node_memory / 100;
    469
    470	if (rt_task(tsk))
    471		dirty += dirty / 4;
    472
    473	return dirty;
    474}
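/*
 * E.g. a node with 2 GiB of dirtyable memory (524,288 pages at 4 KiB) and
 * vm_dirty_ratio = 20 gets a limit of 20 * 524288 / 100 = 104,857 pages;
 * a real-time task may dirty 25% more, i.e. about 131,071 pages.
 */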
    475
    476/**
    477 * node_dirty_ok - tells whether a node is within its dirty limits
    478 * @pgdat: the node to check
    479 *
    480 * Return: %true when the dirty pages in @pgdat are within the node's
    481 * dirty limit, %false if the limit is exceeded.
    482 */
    483bool node_dirty_ok(struct pglist_data *pgdat)
    484{
    485	unsigned long limit = node_dirty_limit(pgdat);
    486	unsigned long nr_pages = 0;
    487
    488	nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
    489	nr_pages += node_page_state(pgdat, NR_WRITEBACK);
    490
    491	return nr_pages <= limit;
    492}
    493
    494#ifdef CONFIG_SYSCTL
    495static int dirty_background_ratio_handler(struct ctl_table *table, int write,
    496		void *buffer, size_t *lenp, loff_t *ppos)
    497{
    498	int ret;
    499
    500	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    501	if (ret == 0 && write)
    502		dirty_background_bytes = 0;
    503	return ret;
    504}
    505
    506static int dirty_background_bytes_handler(struct ctl_table *table, int write,
    507		void *buffer, size_t *lenp, loff_t *ppos)
    508{
    509	int ret;
    510
    511	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
    512	if (ret == 0 && write)
    513		dirty_background_ratio = 0;
    514	return ret;
    515}
    516
    517static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
    518		size_t *lenp, loff_t *ppos)
    519{
    520	int old_ratio = vm_dirty_ratio;
    521	int ret;
    522
    523	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    524	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
    525		writeback_set_ratelimit();
    526		vm_dirty_bytes = 0;
    527	}
    528	return ret;
    529}
    530
    531static int dirty_bytes_handler(struct ctl_table *table, int write,
    532		void *buffer, size_t *lenp, loff_t *ppos)
    533{
    534	unsigned long old_bytes = vm_dirty_bytes;
    535	int ret;
    536
    537	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
    538	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
    539		writeback_set_ratelimit();
    540		vm_dirty_ratio = 0;
    541	}
    542	return ret;
    543}
    544#endif
    545
    546static unsigned long wp_next_time(unsigned long cur_time)
    547{
    548	cur_time += VM_COMPLETIONS_PERIOD_LEN;
    549	/* 0 has a special meaning... */
    550	if (!cur_time)
    551		return 1;
    552	return cur_time;
    553}
    554
    555static void wb_domain_writeout_add(struct wb_domain *dom,
    556				   struct fprop_local_percpu *completions,
    557				   unsigned int max_prop_frac, long nr)
    558{
    559	__fprop_add_percpu_max(&dom->completions, completions,
    560			       max_prop_frac, nr);
    561	/* First event after period switching was turned off? */
    562	if (unlikely(!dom->period_time)) {
    563		/*
    564		 * We can race with other __bdi_writeout_inc calls here but
    565		 * it does not cause any harm since the resulting time when
    566		 * timer will fire and what is in writeout_period_time will be
    567		 * roughly the same.
    568		 */
    569		dom->period_time = wp_next_time(jiffies);
    570		mod_timer(&dom->period_timer, dom->period_time);
    571	}
    572}
    573
    574/*
    575 * Increment @wb's writeout completion count and the global writeout
    576 * completion count. Called from __folio_end_writeback().
    577 */
    578static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
    579{
    580	struct wb_domain *cgdom;
    581
    582	wb_stat_mod(wb, WB_WRITTEN, nr);
    583	wb_domain_writeout_add(&global_wb_domain, &wb->completions,
    584			       wb->bdi->max_prop_frac, nr);
    585
    586	cgdom = mem_cgroup_wb_domain(wb);
    587	if (cgdom)
    588		wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
    589				       wb->bdi->max_prop_frac, nr);
    590}
    591
    592void wb_writeout_inc(struct bdi_writeback *wb)
    593{
    594	unsigned long flags;
    595
    596	local_irq_save(flags);
    597	__wb_writeout_add(wb, 1);
    598	local_irq_restore(flags);
    599}
    600EXPORT_SYMBOL_GPL(wb_writeout_inc);
    601
    602/*
     603 * On an idle system, we can be called long after we were scheduled because
     604 * we use deferred timers, so count the missed periods.
    605 */
    606static void writeout_period(struct timer_list *t)
    607{
    608	struct wb_domain *dom = from_timer(dom, t, period_timer);
    609	int miss_periods = (jiffies - dom->period_time) /
    610						 VM_COMPLETIONS_PERIOD_LEN;
    611
    612	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
    613		dom->period_time = wp_next_time(dom->period_time +
    614				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
    615		mod_timer(&dom->period_timer, dom->period_time);
    616	} else {
    617		/*
    618		 * Aging has zeroed all fractions. Stop wasting CPU on period
    619		 * updates.
    620		 */
    621		dom->period_time = 0;
    622	}
    623}
    624
    625int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
    626{
    627	memset(dom, 0, sizeof(*dom));
    628
    629	spin_lock_init(&dom->lock);
    630
    631	timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
    632
    633	dom->dirty_limit_tstamp = jiffies;
    634
    635	return fprop_global_init(&dom->completions, gfp);
    636}
    637
    638#ifdef CONFIG_CGROUP_WRITEBACK
    639void wb_domain_exit(struct wb_domain *dom)
    640{
    641	del_timer_sync(&dom->period_timer);
    642	fprop_global_destroy(&dom->completions);
    643}
    644#endif
    645
    646/*
    647 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
    648 * registered backing devices, which, for obvious reasons, can not
    649 * exceed 100%.
    650 */
    651static unsigned int bdi_min_ratio;
    652
    653int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
    654{
    655	unsigned int delta;
    656	int ret = 0;
    657
    658	spin_lock_bh(&bdi_lock);
    659	if (min_ratio > bdi->max_ratio) {
    660		ret = -EINVAL;
    661	} else {
    662		if (min_ratio < bdi->min_ratio) {
    663			delta = bdi->min_ratio - min_ratio;
    664			bdi_min_ratio -= delta;
    665			bdi->min_ratio = min_ratio;
    666		} else {
    667			delta = min_ratio - bdi->min_ratio;
    668			if (bdi_min_ratio + delta < 100) {
    669				bdi_min_ratio += delta;
    670				bdi->min_ratio = min_ratio;
    671			} else {
    672				ret = -EINVAL;
    673			}
    674		}
    675	}
    676	spin_unlock_bh(&bdi_lock);
    677
    678	return ret;
    679}
    680
    681int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
    682{
    683	int ret = 0;
    684
    685	if (max_ratio > 100)
    686		return -EINVAL;
    687
    688	spin_lock_bh(&bdi_lock);
    689	if (bdi->min_ratio > max_ratio) {
    690		ret = -EINVAL;
    691	} else {
    692		bdi->max_ratio = max_ratio;
    693		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
    694	}
    695	spin_unlock_bh(&bdi_lock);
    696
    697	return ret;
    698}
    699EXPORT_SYMBOL(bdi_set_max_ratio);
    700
    701static unsigned long dirty_freerun_ceiling(unsigned long thresh,
    702					   unsigned long bg_thresh)
    703{
    704	return (thresh + bg_thresh) / 2;
    705}
    706
    707static unsigned long hard_dirty_limit(struct wb_domain *dom,
    708				      unsigned long thresh)
    709{
    710	return max(thresh, dom->dirty_limit);
    711}
    712
    713/*
    714 * Memory which can be further allocated to a memcg domain is capped by
    715 * system-wide clean memory excluding the amount being used in the domain.
    716 */
    717static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
    718			    unsigned long filepages, unsigned long headroom)
    719{
    720	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
    721	unsigned long clean = filepages - min(filepages, mdtc->dirty);
    722	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
    723	unsigned long other_clean = global_clean - min(global_clean, clean);
    724
    725	mdtc->avail = filepages + min(headroom, other_clean);
    726}
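/*
 * Illustration with made-up numbers: if the memcg has filepages = 100,000
 * of which mdtc->dirty = 20,000 (clean = 80,000), and globally
 * gdtc->avail = 1,000,000 with gdtc->dirty = 300,000 (global_clean =
 * 700,000), then other_clean = 700,000 - 80,000 = 620,000.  With a memcg
 * headroom of 150,000 pages,
 * mdtc->avail = 100,000 + min(150,000, 620,000) = 250,000 pages.
 */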
    727
    728/**
    729 * __wb_calc_thresh - @wb's share of dirty throttling threshold
    730 * @dtc: dirty_throttle_context of interest
    731 *
    732 * Note that balance_dirty_pages() will only seriously take it as a hard limit
    733 * when sleeping max_pause per page is not enough to keep the dirty pages under
    734 * control. For example, when the device is completely stalled due to some error
    735 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
     736 * In other normal situations, it acts more gently by throttling the tasks
     737 * more (rather than completely blocking them) when the wb dirty pages go high.
    738 *
    739 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
    740 * - starving fast devices
    741 * - piling up dirty pages (that will take long time to sync) on slow devices
    742 *
     743 * The wb's share of the dirty limit will adapt to its throughput and be
    744 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
    745 *
    746 * Return: @wb's dirty limit in pages. The term "dirty" in the context of
    747 * dirty balancing includes all PG_dirty and PG_writeback pages.
    748 */
    749static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
    750{
    751	struct wb_domain *dom = dtc_dom(dtc);
    752	unsigned long thresh = dtc->thresh;
    753	u64 wb_thresh;
    754	unsigned long numerator, denominator;
    755	unsigned long wb_min_ratio, wb_max_ratio;
    756
    757	/*
    758	 * Calculate this BDI's share of the thresh ratio.
    759	 */
    760	fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
    761			      &numerator, &denominator);
    762
    763	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
    764	wb_thresh *= numerator;
    765	wb_thresh = div64_ul(wb_thresh, denominator);
    766
    767	wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
    768
    769	wb_thresh += (thresh * wb_min_ratio) / 100;
    770	if (wb_thresh > (thresh * wb_max_ratio) / 100)
    771		wb_thresh = thresh * wb_max_ratio / 100;
    772
    773	return wb_thresh;
    774}
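/*
 * Rough example with hypothetical values (ignoring the bandwidth scaling
 * that wb_min_max_ratio() may apply under cgroup writeback): let
 * dtc->thresh = 200,000 pages, let this bdi be the only one with a
 * nonzero min_ratio of 10 (so bdi_min_ratio = 10), and let its completion
 * fraction be 1/4.  Then the proportional part is
 * 200,000 * 90/100 * 1/4 = 45,000 pages, the min_ratio adds
 * 200,000 * 10/100 = 20,000 pages (65,000 total), and a max_ratio of 30
 * caps the result at 200,000 * 30/100 = 60,000 pages.
 */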
    775
    776unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
    777{
    778	struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
    779					       .thresh = thresh };
    780	return __wb_calc_thresh(&gdtc);
    781}
    782
    783/*
    784 *                           setpoint - dirty 3
    785 *        f(dirty) := 1.0 + (----------------)
    786 *                           limit - setpoint
    787 *
     788 * it's a 3rd order polynomial that is subject to
    789 *
    790 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
    791 * (2) f(setpoint) = 1.0 => the balance point
    792 * (3) f(limit)    = 0   => the hard limit
    793 * (4) df/dx      <= 0	 => negative feedback control
    794 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
    795 *     => fast response on large errors; small oscillation near setpoint
    796 */
    797static long long pos_ratio_polynom(unsigned long setpoint,
    798					  unsigned long dirty,
    799					  unsigned long limit)
    800{
    801	long long pos_ratio;
    802	long x;
    803
    804	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
    805		      (limit - setpoint) | 1);
    806	pos_ratio = x;
    807	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
    808	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
    809	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
    810
    811	return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
    812}
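/*
 * Numerical example (made-up values): setpoint = 600,000, limit =
 * 1,000,000, dirty = 700,000, RATELIMIT_CALC_SHIFT = 10.  Then
 * x ~= (-100,000 << 10) / 400,000 = -256 (the "| 1" on the divisor only
 * nudges this to -255), and
 *
 *	pos_ratio = x^3 >> (2 * RATELIMIT_CALC_SHIFT) + (1 << 10)
 *	          = -16 + 1024 = 1008
 *
 * i.e. about 0.98 in the 1<<10 fixed-point scale, matching
 * 1.0 + (-1/4)^3 = 0.984 from the formula above: dirty slightly above the
 * setpoint yields a pos_ratio slightly below 1.0.
 */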
    813
    814/*
    815 * Dirty position control.
    816 *
    817 * (o) global/bdi setpoints
    818 *
    819 * We want the dirty pages be balanced around the global/wb setpoints.
    820 * When the number of dirty pages is higher/lower than the setpoint, the
    821 * dirty position control ratio (and hence task dirty ratelimit) will be
    822 * decreased/increased to bring the dirty pages back to the setpoint.
    823 *
    824 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
    825 *
    826 *     if (dirty < setpoint) scale up   pos_ratio
    827 *     if (dirty > setpoint) scale down pos_ratio
    828 *
    829 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
    830 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
    831 *
    832 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
    833 *
    834 * (o) global control line
    835 *
    836 *     ^ pos_ratio
    837 *     |
    838 *     |            |<===== global dirty control scope ======>|
    839 * 2.0  * * * * * * *
    840 *     |            .*
    841 *     |            . *
    842 *     |            .   *
    843 *     |            .     *
    844 *     |            .        *
    845 *     |            .            *
    846 * 1.0 ................................*
    847 *     |            .                  .     *
    848 *     |            .                  .          *
    849 *     |            .                  .              *
    850 *     |            .                  .                 *
    851 *     |            .                  .                    *
    852 *   0 +------------.------------------.----------------------*------------->
    853 *           freerun^          setpoint^                 limit^   dirty pages
    854 *
    855 * (o) wb control line
    856 *
    857 *     ^ pos_ratio
    858 *     |
    859 *     |            *
    860 *     |              *
    861 *     |                *
    862 *     |                  *
    863 *     |                    * |<=========== span ============>|
    864 * 1.0 .......................*
    865 *     |                      . *
    866 *     |                      .   *
    867 *     |                      .     *
    868 *     |                      .       *
    869 *     |                      .         *
    870 *     |                      .           *
    871 *     |                      .             *
    872 *     |                      .               *
    873 *     |                      .                 *
    874 *     |                      .                   *
    875 *     |                      .                     *
    876 * 1/4 ...............................................* * * * * * * * * * * *
    877 *     |                      .                         .
    878 *     |                      .                           .
    879 *     |                      .                             .
    880 *   0 +----------------------.-------------------------------.------------->
    881 *                wb_setpoint^                    x_intercept^
    882 *
    883 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
    884 * be smoothly throttled down to normal if it starts high in situations like
    885 * - start writing to a slow SD card and a fast disk at the same time. The SD
    886 *   card's wb_dirty may rush to many times higher than wb_setpoint.
    887 * - the wb dirty thresh drops quickly due to change of JBOD workload
    888 */
    889static void wb_position_ratio(struct dirty_throttle_control *dtc)
    890{
    891	struct bdi_writeback *wb = dtc->wb;
    892	unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
    893	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
    894	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
    895	unsigned long wb_thresh = dtc->wb_thresh;
    896	unsigned long x_intercept;
    897	unsigned long setpoint;		/* dirty pages' target balance point */
    898	unsigned long wb_setpoint;
    899	unsigned long span;
    900	long long pos_ratio;		/* for scaling up/down the rate limit */
    901	long x;
    902
    903	dtc->pos_ratio = 0;
    904
    905	if (unlikely(dtc->dirty >= limit))
    906		return;
    907
    908	/*
    909	 * global setpoint
    910	 *
    911	 * See comment for pos_ratio_polynom().
    912	 */
    913	setpoint = (freerun + limit) / 2;
    914	pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
    915
    916	/*
    917	 * The strictlimit feature is a tool preventing mistrusted filesystems
    918	 * from growing a large number of dirty pages before throttling. For
     919	 * such filesystems balance_dirty_pages always checks wb counters
     920	 * against wb limits, even if the global "nr_dirty" is under "freerun".
     921	 * This is especially important for fuse, which sets bdi->max_ratio to
     922	 * 1% by default. Without the strictlimit feature, fuse writeback may
     923	 * consume an arbitrary amount of RAM because it is accounted in
     924	 * NR_WRITEBACK_TEMP, which is not involved in calculating "nr_dirty".
    925	 *
    926	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
    927	 * two values: wb_dirty and wb_thresh. Let's consider an example:
    928	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
    929	 * limits are set by default to 10% and 20% (background and throttle).
    930	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
    931	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
    932	 * about ~6K pages (as the average of background and throttle wb
    933	 * limits). The 3rd order polynomial will provide positive feedback if
    934	 * wb_dirty is under wb_setpoint and vice versa.
    935	 *
    936	 * Note, that we cannot use global counters in these calculations
    937	 * because we want to throttle process writing to a strictlimit wb
    938	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
    939	 * in the example above).
    940	 */
    941	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
    942		long long wb_pos_ratio;
    943
    944		if (dtc->wb_dirty < 8) {
    945			dtc->pos_ratio = min_t(long long, pos_ratio * 2,
    946					   2 << RATELIMIT_CALC_SHIFT);
    947			return;
    948		}
    949
    950		if (dtc->wb_dirty >= wb_thresh)
    951			return;
    952
    953		wb_setpoint = dirty_freerun_ceiling(wb_thresh,
    954						    dtc->wb_bg_thresh);
    955
    956		if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
    957			return;
    958
    959		wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
    960						 wb_thresh);
    961
    962		/*
     963		 * Typically, for the strictlimit case, wb_setpoint << setpoint
     964		 * and pos_ratio >> wb_pos_ratio. In other words, the global
     965		 * state ("dirty") is not the limiting factor and we have to
    966		 * make decision based on wb counters. But there is an
    967		 * important case when global pos_ratio should get precedence:
    968		 * global limits are exceeded (e.g. due to activities on other
    969		 * wb's) while given strictlimit wb is below limit.
    970		 *
    971		 * "pos_ratio * wb_pos_ratio" would work for the case above,
    972		 * but it would look too non-natural for the case of all
    973		 * activity in the system coming from a single strictlimit wb
    974		 * with bdi->max_ratio == 100%.
    975		 *
    976		 * Note that min() below somewhat changes the dynamics of the
    977		 * control system. Normally, pos_ratio value can be well over 3
    978		 * (when globally we are at freerun and wb is well below wb
    979		 * setpoint). Now the maximum pos_ratio in the same situation
    980		 * is 2. We might want to tweak this if we observe the control
    981		 * system is too slow to adapt.
    982		 */
    983		dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
    984		return;
    985	}
    986
    987	/*
    988	 * We have computed basic pos_ratio above based on global situation. If
    989	 * the wb is over/under its share of dirty pages, we want to scale
    990	 * pos_ratio further down/up. That is done by the following mechanism.
    991	 */
    992
    993	/*
    994	 * wb setpoint
    995	 *
    996	 *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
    997	 *
    998	 *                        x_intercept - wb_dirty
    999	 *                     := --------------------------
   1000	 *                        x_intercept - wb_setpoint
   1001	 *
    1002	 * The main wb control line is a linear function that is subject to
   1003	 *
   1004	 * (1) f(wb_setpoint) = 1.0
   1005	 * (2) k = - 1 / (8 * write_bw)  (in single wb case)
   1006	 *     or equally: x_intercept = wb_setpoint + 8 * write_bw
   1007	 *
   1008	 * For single wb case, the dirty pages are observed to fluctuate
   1009	 * regularly within range
   1010	 *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
   1011	 * for various filesystems, where (2) can yield in a reasonable 12.5%
   1012	 * fluctuation range for pos_ratio.
   1013	 *
   1014	 * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
   1015	 * own size, so move the slope over accordingly and choose a slope that
   1016	 * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
   1017	 */
   1018	if (unlikely(wb_thresh > dtc->thresh))
   1019		wb_thresh = dtc->thresh;
   1020	/*
   1021	 * It's very possible that wb_thresh is close to 0 not because the
    1022	 * device is slow, but because it has remained inactive for a long time.
    1023	 * Honour such devices with a reasonably good (hopefully IO efficient)
   1024	 * threshold, so that the occasional writes won't be blocked and active
   1025	 * writes can rampup the threshold quickly.
   1026	 */
   1027	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
   1028	/*
   1029	 * scale global setpoint to wb's:
   1030	 *	wb_setpoint = setpoint * wb_thresh / thresh
   1031	 */
   1032	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
   1033	wb_setpoint = setpoint * (u64)x >> 16;
   1034	/*
   1035	 * Use span=(8*write_bw) in single wb case as indicated by
   1036	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
   1037	 *
   1038	 *        wb_thresh                    thresh - wb_thresh
   1039	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
   1040	 *         thresh                           thresh
   1041	 */
   1042	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
   1043	x_intercept = wb_setpoint + span;
   1044
   1045	if (dtc->wb_dirty < x_intercept - span / 4) {
   1046		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
   1047				      (x_intercept - wb_setpoint) | 1);
   1048	} else
   1049		pos_ratio /= 4;
   1050
   1051	/*
   1052	 * wb reserve area, safeguard against dirty pool underrun and disk idle
   1053	 * It may push the desired control point of global dirty pages higher
   1054	 * than setpoint.
   1055	 */
   1056	x_intercept = wb_thresh / 2;
   1057	if (dtc->wb_dirty < x_intercept) {
   1058		if (dtc->wb_dirty > x_intercept / 8)
   1059			pos_ratio = div_u64(pos_ratio * x_intercept,
   1060					    dtc->wb_dirty);
   1061		else
   1062			pos_ratio *= 8;
   1063	}
   1064
   1065	dtc->pos_ratio = pos_ratio;
   1066}
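/*
 * To make the scaling above concrete (illustrative numbers): with
 * dtc->thresh = 200,000, wb_thresh = 50,000, setpoint = 150,000 and
 * write_bw = 25,000 pages/s, x ~= (50,000 << 16) / 200,000, i.e. ~1/4 in
 * fixed point, so wb_setpoint ~= 150,000 / 4 = 37,500 and
 * span ~= (150,000 + 200,000) / 4 = 87,500, putting x_intercept at about
 * 125,000 wb_dirty pages.
 */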
   1067
   1068static void wb_update_write_bandwidth(struct bdi_writeback *wb,
   1069				      unsigned long elapsed,
   1070				      unsigned long written)
   1071{
   1072	const unsigned long period = roundup_pow_of_two(3 * HZ);
   1073	unsigned long avg = wb->avg_write_bandwidth;
   1074	unsigned long old = wb->write_bandwidth;
   1075	u64 bw;
   1076
   1077	/*
   1078	 * bw = written * HZ / elapsed
   1079	 *
   1080	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
   1081	 * write_bandwidth = ---------------------------------------------------
   1082	 *                                          period
   1083	 *
   1084	 * @written may have decreased due to folio_account_redirty().
   1085	 * Avoid underflowing @bw calculation.
   1086	 */
   1087	bw = written - min(written, wb->written_stamp);
   1088	bw *= HZ;
   1089	if (unlikely(elapsed > period)) {
   1090		bw = div64_ul(bw, elapsed);
   1091		avg = bw;
   1092		goto out;
   1093	}
   1094	bw += (u64)wb->write_bandwidth * (period - elapsed);
   1095	bw >>= ilog2(period);
   1096
   1097	/*
   1098	 * one more level of smoothing, for filtering out sudden spikes
   1099	 */
   1100	if (avg > old && old >= (unsigned long)bw)
   1101		avg -= (avg - old) >> 3;
   1102
   1103	if (avg < old && old <= (unsigned long)bw)
   1104		avg += (old - avg) >> 3;
   1105
   1106out:
   1107	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
   1108	avg = max(avg, 1LU);
   1109	if (wb_has_dirty_io(wb)) {
   1110		long delta = avg - wb->avg_write_bandwidth;
   1111		WARN_ON_ONCE(atomic_long_add_return(delta,
   1112					&wb->bdi->tot_write_bandwidth) <= 0);
   1113	}
   1114	wb->write_bandwidth = bw;
   1115	WRITE_ONCE(wb->avg_write_bandwidth, avg);
   1116}
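/*
 * Example of the blending above, assuming HZ == 1000 so that
 * period = roundup_pow_of_two(3 * HZ) = 4096 jiffies: if 25,600 pages were
 * written during elapsed = 200 jiffies, the instantaneous rate is
 * 25,600 * 1000 / 200 = 128,000 pages/s (~500 MB/s at 4 KiB pages), but it
 * is blended in with weight elapsed/period ~= 5%, so a previous
 * write_bandwidth of 100,000 pages/s moves to roughly
 * (25,600,000 + 100,000 * 3896) / 4096 ~= 101,367 pages/s.
 */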
   1117
   1118static void update_dirty_limit(struct dirty_throttle_control *dtc)
   1119{
   1120	struct wb_domain *dom = dtc_dom(dtc);
   1121	unsigned long thresh = dtc->thresh;
   1122	unsigned long limit = dom->dirty_limit;
   1123
   1124	/*
   1125	 * Follow up in one step.
   1126	 */
   1127	if (limit < thresh) {
   1128		limit = thresh;
   1129		goto update;
   1130	}
   1131
   1132	/*
   1133	 * Follow down slowly. Use the higher one as the target, because thresh
   1134	 * may drop below dirty. This is exactly the reason to introduce
   1135	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
   1136	 */
   1137	thresh = max(thresh, dtc->dirty);
   1138	if (limit > thresh) {
   1139		limit -= (limit - thresh) >> 5;
   1140		goto update;
   1141	}
   1142	return;
   1143update:
   1144	dom->dirty_limit = limit;
   1145}
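/*
 * E.g. if dom->dirty_limit is 300,000 pages and thresh (and dirty) have
 * dropped to 200,000, each update only takes off
 * (300,000 - 200,000) >> 5 = 3,125 pages, so the limit decays towards the
 * new threshold by ~3% of the gap per BANDWIDTH_INTERVAL.
 */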
   1146
   1147static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
   1148				      unsigned long now)
   1149{
   1150	struct wb_domain *dom = dtc_dom(dtc);
   1151
   1152	/*
   1153	 * check locklessly first to optimize away locking for the most time
    1154	 * check locklessly first to optimize away locking most of the time
   1155	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
   1156		return;
   1157
   1158	spin_lock(&dom->lock);
   1159	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
   1160		update_dirty_limit(dtc);
   1161		dom->dirty_limit_tstamp = now;
   1162	}
   1163	spin_unlock(&dom->lock);
   1164}
   1165
   1166/*
   1167 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
   1168 *
   1169 * Normal wb tasks will be curbed at or below it in long term.
   1170 * Obviously it should be around (write_bw / N) when there are N dd tasks.
   1171 */
   1172static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
   1173				      unsigned long dirtied,
   1174				      unsigned long elapsed)
   1175{
   1176	struct bdi_writeback *wb = dtc->wb;
   1177	unsigned long dirty = dtc->dirty;
   1178	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
   1179	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
   1180	unsigned long setpoint = (freerun + limit) / 2;
   1181	unsigned long write_bw = wb->avg_write_bandwidth;
   1182	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
   1183	unsigned long dirty_rate;
   1184	unsigned long task_ratelimit;
   1185	unsigned long balanced_dirty_ratelimit;
   1186	unsigned long step;
   1187	unsigned long x;
   1188	unsigned long shift;
   1189
   1190	/*
   1191	 * The dirty rate will match the writeout rate in long term, except
   1192	 * when dirty pages are truncated by userspace or re-dirtied by FS.
   1193	 */
   1194	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
   1195
   1196	/*
   1197	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
   1198	 */
   1199	task_ratelimit = (u64)dirty_ratelimit *
   1200					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
   1201	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
   1202
   1203	/*
   1204	 * A linear estimation of the "balanced" throttle rate. The theory is,
   1205	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
   1206	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
   1207	 * formula will yield the balanced rate limit (write_bw / N).
   1208	 *
   1209	 * Note that the expanded form is not a pure rate feedback:
   1210	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
   1211	 * but also takes pos_ratio into account:
   1212	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
   1213	 *
   1214	 * (1) is not realistic because pos_ratio also takes part in balancing
   1215	 * the dirty rate.  Consider the state
   1216	 *	pos_ratio = 0.5						     (3)
   1217	 *	rate = 2 * (write_bw / N)				     (4)
    1218	 * If (1) is used, it will get stuck in that state, because each dd will
   1219	 * be throttled at
   1220	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
   1221	 * yielding
   1222	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
   1223	 * put (6) into (1) we get
   1224	 *	rate_(i+1) = rate_(i)					     (7)
   1225	 *
   1226	 * So we end up using (2) to always keep
   1227	 *	rate_(i+1) ~= (write_bw / N)				     (8)
   1228	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
   1229	 * pos_ratio is able to drive itself to 1.0, which is not only where
    1230	 * the dirty count meets the setpoint, but also where the slope of
   1231	 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
   1232	 */
   1233	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
   1234					   dirty_rate | 1);
   1235	/*
   1236	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
   1237	 */
   1238	if (unlikely(balanced_dirty_ratelimit > write_bw))
   1239		balanced_dirty_ratelimit = write_bw;
   1240
   1241	/*
   1242	 * We could safely do this and return immediately:
   1243	 *
   1244	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
   1245	 *
   1246	 * However to get a more stable dirty_ratelimit, the below elaborated
   1247	 * code makes use of task_ratelimit to filter out singular points and
   1248	 * limit the step size.
   1249	 *
   1250	 * The below code essentially only uses the relative value of
   1251	 *
   1252	 *	task_ratelimit - dirty_ratelimit
   1253	 *	= (pos_ratio - 1) * dirty_ratelimit
   1254	 *
   1255	 * which reflects the direction and size of dirty position error.
   1256	 */
   1257
   1258	/*
   1259	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
   1260	 * task_ratelimit is on the same side of dirty_ratelimit, too.
   1261	 * For example, when
   1262	 * - dirty_ratelimit > balanced_dirty_ratelimit
   1263	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
   1264	 * lowering dirty_ratelimit will help meet both the position and rate
   1265	 * control targets. Otherwise, don't update dirty_ratelimit if it will
   1266	 * only help meet the rate target. After all, what the users ultimately
   1267	 * feel and care are stable dirty rate and small position error.
   1268	 *
   1269	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
    1270	 * and filter out the singular points of balanced_dirty_ratelimit, which
   1271	 * keeps jumping around randomly and can even leap far away at times
   1272	 * due to the small 200ms estimation period of dirty_rate (we want to
   1273	 * keep that period small to reduce time lags).
   1274	 */
   1275	step = 0;
   1276
   1277	/*
   1278	 * For strictlimit case, calculations above were based on wb counters
   1279	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
   1280	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
   1281	 * Hence, to calculate "step" properly, we have to use wb_dirty as
   1282	 * "dirty" and wb_setpoint as "setpoint".
   1283	 *
   1284	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
   1285	 * it's possible that wb_thresh is close to zero due to inactivity
   1286	 * of backing device.
   1287	 */
   1288	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
   1289		dirty = dtc->wb_dirty;
   1290		if (dtc->wb_dirty < 8)
   1291			setpoint = dtc->wb_dirty + 1;
   1292		else
   1293			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
   1294	}
   1295
   1296	if (dirty < setpoint) {
   1297		x = min3(wb->balanced_dirty_ratelimit,
   1298			 balanced_dirty_ratelimit, task_ratelimit);
   1299		if (dirty_ratelimit < x)
   1300			step = x - dirty_ratelimit;
   1301	} else {
   1302		x = max3(wb->balanced_dirty_ratelimit,
   1303			 balanced_dirty_ratelimit, task_ratelimit);
   1304		if (dirty_ratelimit > x)
   1305			step = dirty_ratelimit - x;
   1306	}
   1307
   1308	/*
   1309	 * Don't pursue 100% rate matching. It's impossible since the balanced
   1310	 * rate itself is constantly fluctuating. So decrease the track speed
   1311	 * when it gets close to the target. Helps eliminate pointless tremors.
   1312	 */
   1313	shift = dirty_ratelimit / (2 * step + 1);
   1314	if (shift < BITS_PER_LONG)
   1315		step = DIV_ROUND_UP(step >> shift, 8);
   1316	else
   1317		step = 0;
   1318
   1319	if (dirty_ratelimit < balanced_dirty_ratelimit)
   1320		dirty_ratelimit += step;
   1321	else
   1322		dirty_ratelimit -= step;
   1323
   1324	WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
   1325	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
   1326
   1327	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
   1328}
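/*
 * Example of the balanced rate estimate above (hypothetical numbers): with
 * write_bw = 25,000 pages/s and two dd tasks each currently throttled at
 * task_ratelimit = 25,000 pages/s, the measured dirty_rate is
 * ~50,000 pages/s, so
 *
 *	balanced_dirty_ratelimit = 25,000 * 25,000 / 50,000
 *	                         = 12,500 pages/s = write_bw / 2
 *
 * which is exactly the per-task share the formula is designed to converge
 * on.
 */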
   1329
   1330static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
   1331				  struct dirty_throttle_control *mdtc,
   1332				  bool update_ratelimit)
   1333{
   1334	struct bdi_writeback *wb = gdtc->wb;
   1335	unsigned long now = jiffies;
   1336	unsigned long elapsed;
   1337	unsigned long dirtied;
   1338	unsigned long written;
   1339
   1340	spin_lock(&wb->list_lock);
   1341
   1342	/*
   1343	 * Lockless checks for elapsed time are racy and delayed update after
   1344	 * IO completion doesn't do it at all (to make sure written pages are
   1345	 * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
   1346	 * division errors.
   1347	 */
   1348	elapsed = max(now - wb->bw_time_stamp, 1UL);
   1349	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
   1350	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
   1351
   1352	if (update_ratelimit) {
   1353		domain_update_dirty_limit(gdtc, now);
   1354		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
   1355
   1356		/*
   1357		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
   1358		 * compiler has no way to figure that out.  Help it.
   1359		 */
   1360		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
   1361			domain_update_dirty_limit(mdtc, now);
   1362			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
   1363		}
   1364	}
   1365	wb_update_write_bandwidth(wb, elapsed, written);
   1366
   1367	wb->dirtied_stamp = dirtied;
   1368	wb->written_stamp = written;
   1369	WRITE_ONCE(wb->bw_time_stamp, now);
   1370	spin_unlock(&wb->list_lock);
   1371}
   1372
   1373void wb_update_bandwidth(struct bdi_writeback *wb)
   1374{
   1375	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
   1376
   1377	__wb_update_bandwidth(&gdtc, NULL, false);
   1378}
   1379
   1380/* Interval after which we consider wb idle and don't estimate bandwidth */
   1381#define WB_BANDWIDTH_IDLE_JIF (HZ)
   1382
   1383static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
   1384{
   1385	unsigned long now = jiffies;
   1386	unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
   1387
   1388	if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
   1389	    !atomic_read(&wb->writeback_inodes)) {
   1390		spin_lock(&wb->list_lock);
   1391		wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
   1392		wb->written_stamp = wb_stat(wb, WB_WRITTEN);
   1393		WRITE_ONCE(wb->bw_time_stamp, now);
   1394		spin_unlock(&wb->list_lock);
   1395	}
   1396}
   1397
   1398/*
   1399 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
   1400 * will look to see if it needs to start dirty throttling.
   1401 *
   1402 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
   1403 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
   1404 * (the number of pages we may dirty without exceeding the dirty limits).
   1405 */
   1406static unsigned long dirty_poll_interval(unsigned long dirty,
   1407					 unsigned long thresh)
   1408{
   1409	if (thresh > dirty)
   1410		return 1UL << (ilog2(thresh - dirty) >> 1);
   1411
   1412	return 1;
   1413}
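/*
 * E.g. with a safety margin of thresh - dirty = 65,536 pages,
 * ilog2(65536) = 16, so the task may dirty 1 << 8 = 256 pages before
 * polling the global counters again.
 */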
   1414
   1415static unsigned long wb_max_pause(struct bdi_writeback *wb,
   1416				  unsigned long wb_dirty)
   1417{
   1418	unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
   1419	unsigned long t;
   1420
   1421	/*
    1422	 * Limit pause time for small memory systems. If we sleep for too long,
    1423	 * a small pool of dirty/writeback pages may go empty and the disk may
    1424	 * go idle.
   1425	 *
   1426	 * 8 serves as the safety ratio.
   1427	 */
   1428	t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
   1429	t++;
   1430
   1431	return min_t(unsigned long, t, MAX_PAUSE);
   1432}
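/*
 * Example (assuming HZ == 1000): with bw = 25,000 pages/s,
 * roundup_pow_of_two(1 + HZ / 8) = 128, so bw / 128 = 195, and
 * wb_dirty = 2,000 gives t = 2000 / 196 + 1 = 11 jiffies, well under
 * MAX_PAUSE (200 jiffies).
 */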
   1433
   1434static long wb_min_pause(struct bdi_writeback *wb,
   1435			 long max_pause,
   1436			 unsigned long task_ratelimit,
   1437			 unsigned long dirty_ratelimit,
   1438			 int *nr_dirtied_pause)
   1439{
   1440	long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
   1441	long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
   1442	long t;		/* target pause */
   1443	long pause;	/* estimated next pause */
   1444	int pages;	/* target nr_dirtied_pause */
   1445
   1446	/* target for 10ms pause on 1-dd case */
   1447	t = max(1, HZ / 100);
   1448
   1449	/*
   1450	 * Scale up pause time for concurrent dirtiers in order to reduce CPU
   1451	 * overheads.
   1452	 *
   1453	 * (N * 10ms) on 2^N concurrent tasks.
   1454	 */
   1455	if (hi > lo)
   1456		t += (hi - lo) * (10 * HZ) / 1024;
   1457
   1458	/*
   1459	 * This is a bit convoluted. We try to base the next nr_dirtied_pause
   1460	 * on the much more stable dirty_ratelimit. However the next pause time
   1461	 * will be computed based on task_ratelimit and the two rate limits may
   1462	 * depart considerably at some time. Especially if task_ratelimit goes
   1463	 * below dirty_ratelimit/2 and the target pause is max_pause, the next
   1464	 * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
   1465	 * result task_ratelimit won't be executed faithfully, which could
   1466	 * eventually bring down dirty_ratelimit.
   1467	 *
   1468	 * We apply two rules to fix it up:
   1469	 * 1) try to estimate the next pause time and if necessary, use a lower
   1470	 *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
   1471	 *    nr_dirtied_pause will be "dancing" with task_ratelimit.
   1472	 * 2) limit the target pause time to max_pause/2, so that the normal
   1473	 *    small fluctuations of task_ratelimit won't trigger rule (1) and
   1474	 *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
   1475	 */
   1476	t = min(t, 1 + max_pause / 2);
   1477	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
   1478
   1479	/*
   1480	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
   1481	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
   1482	 * When the 16 consecutive reads are often interrupted by some dirty
   1483	 * throttling pause during the async writes, cfq will go into idles
   1484	 * (deadline is fine). So push nr_dirtied_pause as high as possible
    1485	 * until it reaches DIRTY_POLL_THRESH=32 pages.
   1486	 */
   1487	if (pages < DIRTY_POLL_THRESH) {
   1488		t = max_pause;
   1489		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
   1490		if (pages > DIRTY_POLL_THRESH) {
   1491			pages = DIRTY_POLL_THRESH;
   1492			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
   1493		}
   1494	}
   1495
   1496	pause = HZ * pages / (task_ratelimit + 1);
   1497	if (pause > max_pause) {
   1498		t = max_pause;
   1499		pages = task_ratelimit * t / roundup_pow_of_two(HZ);
   1500	}
   1501
   1502	*nr_dirtied_pause = pages;
   1503	/*
   1504	 * The minimal pause time will normally be half the target pause time.
   1505	 */
   1506	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
   1507}
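
        /*
         * An illustrative run of the above, assuming HZ=1000, four equal
         * dirtiers sharing a ~100 MB/s disk, and a max_pause large enough
         * that neither clamp triggers (all numbers are made up):
         *
         *   avg_write_bandwidth = 25600 pages/s -> hi = ilog2(25600) = 14
         *   dirty_ratelimit     =  6400 pages/s -> lo = ilog2(6400)  = 12
         *   t     = HZ/100 + (hi - lo) * (10 * HZ) / 1024 = 10 + 19 = 29
         *   pages = 6400 * 29 / roundup_pow_of_two(1000) = 181 >= DIRTY_POLL_THRESH
         *
         * so nr_dirtied_pause becomes ~181 pages and the returned minimum
         * pause is 1 + 29/2 = 15 jiffies, i.e. each dirtier sleeps >= ~15ms.
         */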
   1508
   1509static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
   1510{
   1511	struct bdi_writeback *wb = dtc->wb;
   1512	unsigned long wb_reclaimable;
   1513
   1514	/*
    1515	 * wb_thresh is not treated as a hard limiting factor the way
    1516	 * dirty_thresh is, for two reasons:
   1517	 * - in JBOD setup, wb_thresh can fluctuate a lot
   1518	 * - in a system with HDD and USB key, the USB key may somehow
   1519	 *   go into state (wb_dirty >> wb_thresh) either because
   1520	 *   wb_dirty starts high, or because wb_thresh drops low.
   1521	 *   In this case we don't want to hard throttle the USB key
   1522	 *   dirtiers for 100 seconds until wb_dirty drops under
   1523	 *   wb_thresh. Instead the auxiliary wb control line in
   1524	 *   wb_position_ratio() will let the dirtier task progress
   1525	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
   1526	 */
   1527	dtc->wb_thresh = __wb_calc_thresh(dtc);
   1528	dtc->wb_bg_thresh = dtc->thresh ?
   1529		div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
   1530
   1531	/*
   1532	 * In order to avoid the stacked BDI deadlock we need
   1533	 * to ensure we accurately count the 'dirty' pages when
   1534	 * the threshold is low.
   1535	 *
   1536	 * Otherwise it would be possible to get thresh+n pages
   1537	 * reported dirty, even though there are thresh-m pages
   1538	 * actually dirty; with m+n sitting in the percpu
   1539	 * deltas.
   1540	 */
   1541	if (dtc->wb_thresh < 2 * wb_stat_error()) {
   1542		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
   1543		dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
   1544	} else {
   1545		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
   1546		dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
   1547	}
   1548}
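
        /*
         * For example, with thresh = 10000 pages, bg_thresh = 5000 pages and
         * a computed wb_thresh of 2000 pages, this wb gets wb_bg_thresh =
         * 2000 * 5000 / 10000 = 1000 pages, preserving the global
         * background/dirty ratio within the wb's share.  The wb_stat_sum()
         * branch pays for an exact all-CPU sum only when wb_thresh is small
         * enough to be comparable to the worst-case drift of the cheap
         * per-CPU counters.
         */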
   1549
   1550/*
   1551 * balance_dirty_pages() must be called by processes which are generating dirty
   1552 * data.  It looks at the number of dirty pages in the machine and will force
    1553 * the caller to wait once it crosses (background_thresh + dirty_thresh) / 2.
   1554 * If we're over `background_thresh' then the writeback threads are woken to
   1555 * perform some writeout.
   1556 */
   1557static void balance_dirty_pages(struct bdi_writeback *wb,
   1558				unsigned long pages_dirtied)
   1559{
   1560	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
   1561	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
   1562	struct dirty_throttle_control * const gdtc = &gdtc_stor;
   1563	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
   1564						     &mdtc_stor : NULL;
   1565	struct dirty_throttle_control *sdtc;
   1566	unsigned long nr_reclaimable;	/* = file_dirty */
   1567	long period;
   1568	long pause;
   1569	long max_pause;
   1570	long min_pause;
   1571	int nr_dirtied_pause;
   1572	bool dirty_exceeded = false;
   1573	unsigned long task_ratelimit;
   1574	unsigned long dirty_ratelimit;
   1575	struct backing_dev_info *bdi = wb->bdi;
   1576	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
   1577	unsigned long start_time = jiffies;
   1578
   1579	for (;;) {
   1580		unsigned long now = jiffies;
   1581		unsigned long dirty, thresh, bg_thresh;
   1582		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
   1583		unsigned long m_thresh = 0;
   1584		unsigned long m_bg_thresh = 0;
   1585
   1586		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
   1587		gdtc->avail = global_dirtyable_memory();
   1588		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
   1589
   1590		domain_dirty_limits(gdtc);
   1591
   1592		if (unlikely(strictlimit)) {
   1593			wb_dirty_limits(gdtc);
   1594
   1595			dirty = gdtc->wb_dirty;
   1596			thresh = gdtc->wb_thresh;
   1597			bg_thresh = gdtc->wb_bg_thresh;
   1598		} else {
   1599			dirty = gdtc->dirty;
   1600			thresh = gdtc->thresh;
   1601			bg_thresh = gdtc->bg_thresh;
   1602		}
   1603
   1604		if (mdtc) {
   1605			unsigned long filepages, headroom, writeback;
   1606
   1607			/*
   1608			 * If @wb belongs to !root memcg, repeat the same
   1609			 * basic calculations for the memcg domain.
   1610			 */
   1611			mem_cgroup_wb_stats(wb, &filepages, &headroom,
   1612					    &mdtc->dirty, &writeback);
   1613			mdtc->dirty += writeback;
   1614			mdtc_calc_avail(mdtc, filepages, headroom);
   1615
   1616			domain_dirty_limits(mdtc);
   1617
   1618			if (unlikely(strictlimit)) {
   1619				wb_dirty_limits(mdtc);
   1620				m_dirty = mdtc->wb_dirty;
   1621				m_thresh = mdtc->wb_thresh;
   1622				m_bg_thresh = mdtc->wb_bg_thresh;
   1623			} else {
   1624				m_dirty = mdtc->dirty;
   1625				m_thresh = mdtc->thresh;
   1626				m_bg_thresh = mdtc->bg_thresh;
   1627			}
   1628		}
   1629
   1630		/*
    1631		 * Throttle it only when the background writeback cannot
    1632		 * catch up. This avoids (excessively) small writeouts
    1633		 * when the wb limits are ramping up in the !strictlimit case.
    1634		 *
    1635		 * In the strictlimit case, make the decision based on the wb
    1636		 * counters and limits. Small writeouts when the wb limits are
    1637		 * ramping up are the price we consciously pay for strictlimit-ing.
   1638		 *
   1639		 * If memcg domain is in effect, @dirty should be under
   1640		 * both global and memcg freerun ceilings.
   1641		 */
   1642		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
   1643		    (!mdtc ||
   1644		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
   1645			unsigned long intv;
   1646			unsigned long m_intv;
   1647
   1648free_running:
   1649			intv = dirty_poll_interval(dirty, thresh);
   1650			m_intv = ULONG_MAX;
   1651
   1652			current->dirty_paused_when = now;
   1653			current->nr_dirtied = 0;
   1654			if (mdtc)
   1655				m_intv = dirty_poll_interval(m_dirty, m_thresh);
   1656			current->nr_dirtied_pause = min(intv, m_intv);
   1657			break;
   1658		}
   1659
   1660		if (unlikely(!writeback_in_progress(wb)))
   1661			wb_start_background_writeback(wb);
   1662
   1663		mem_cgroup_flush_foreign(wb);
   1664
   1665		/*
   1666		 * Calculate global domain's pos_ratio and select the
   1667		 * global dtc by default.
   1668		 */
   1669		if (!strictlimit) {
   1670			wb_dirty_limits(gdtc);
   1671
   1672			if ((current->flags & PF_LOCAL_THROTTLE) &&
   1673			    gdtc->wb_dirty <
   1674			    dirty_freerun_ceiling(gdtc->wb_thresh,
   1675						  gdtc->wb_bg_thresh))
   1676				/*
   1677				 * LOCAL_THROTTLE tasks must not be throttled
   1678				 * when below the per-wb freerun ceiling.
   1679				 */
   1680				goto free_running;
   1681		}
   1682
   1683		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
   1684			((gdtc->dirty > gdtc->thresh) || strictlimit);
   1685
   1686		wb_position_ratio(gdtc);
   1687		sdtc = gdtc;
   1688
   1689		if (mdtc) {
   1690			/*
   1691			 * If memcg domain is in effect, calculate its
   1692			 * pos_ratio.  @wb should satisfy constraints from
   1693			 * both global and memcg domains.  Choose the one
   1694			 * w/ lower pos_ratio.
   1695			 */
   1696			if (!strictlimit) {
   1697				wb_dirty_limits(mdtc);
   1698
   1699				if ((current->flags & PF_LOCAL_THROTTLE) &&
   1700				    mdtc->wb_dirty <
   1701				    dirty_freerun_ceiling(mdtc->wb_thresh,
   1702							  mdtc->wb_bg_thresh))
   1703					/*
   1704					 * LOCAL_THROTTLE tasks must not be
   1705					 * throttled when below the per-wb
   1706					 * freerun ceiling.
   1707					 */
   1708					goto free_running;
   1709			}
   1710			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
   1711				((mdtc->dirty > mdtc->thresh) || strictlimit);
   1712
   1713			wb_position_ratio(mdtc);
   1714			if (mdtc->pos_ratio < gdtc->pos_ratio)
   1715				sdtc = mdtc;
   1716		}
   1717
   1718		if (dirty_exceeded && !wb->dirty_exceeded)
   1719			wb->dirty_exceeded = 1;
   1720
   1721		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
   1722					   BANDWIDTH_INTERVAL))
   1723			__wb_update_bandwidth(gdtc, mdtc, true);
   1724
   1725		/* throttle according to the chosen dtc */
   1726		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
   1727		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
   1728							RATELIMIT_CALC_SHIFT;
   1729		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
   1730		min_pause = wb_min_pause(wb, max_pause,
   1731					 task_ratelimit, dirty_ratelimit,
   1732					 &nr_dirtied_pause);
   1733
   1734		if (unlikely(task_ratelimit == 0)) {
   1735			period = max_pause;
   1736			pause = max_pause;
   1737			goto pause;
   1738		}
   1739		period = HZ * pages_dirtied / task_ratelimit;
   1740		pause = period;
   1741		if (current->dirty_paused_when)
   1742			pause -= now - current->dirty_paused_when;
   1743		/*
    1744		 * For think time of less than 1s (ext3/4 may block the dirtier
    1745		 * for up to 800ms from time to time on a single HDD; so does xfs,
    1746		 * though much less frequently), try to compensate for it in
    1747		 * future periods by updating the virtual time; otherwise just
    1748		 * do a reset, as it may be a light dirtier.
   1749		 */
   1750		if (pause < min_pause) {
   1751			trace_balance_dirty_pages(wb,
   1752						  sdtc->thresh,
   1753						  sdtc->bg_thresh,
   1754						  sdtc->dirty,
   1755						  sdtc->wb_thresh,
   1756						  sdtc->wb_dirty,
   1757						  dirty_ratelimit,
   1758						  task_ratelimit,
   1759						  pages_dirtied,
   1760						  period,
   1761						  min(pause, 0L),
   1762						  start_time);
   1763			if (pause < -HZ) {
   1764				current->dirty_paused_when = now;
   1765				current->nr_dirtied = 0;
   1766			} else if (period) {
   1767				current->dirty_paused_when += period;
   1768				current->nr_dirtied = 0;
   1769			} else if (current->nr_dirtied_pause <= pages_dirtied)
   1770				current->nr_dirtied_pause += pages_dirtied;
   1771			break;
   1772		}
   1773		if (unlikely(pause > max_pause)) {
   1774			/* for occasional dropped task_ratelimit */
   1775			now += min(pause - max_pause, max_pause);
   1776			pause = max_pause;
   1777		}
   1778
   1779pause:
   1780		trace_balance_dirty_pages(wb,
   1781					  sdtc->thresh,
   1782					  sdtc->bg_thresh,
   1783					  sdtc->dirty,
   1784					  sdtc->wb_thresh,
   1785					  sdtc->wb_dirty,
   1786					  dirty_ratelimit,
   1787					  task_ratelimit,
   1788					  pages_dirtied,
   1789					  period,
   1790					  pause,
   1791					  start_time);
   1792		__set_current_state(TASK_KILLABLE);
   1793		wb->dirty_sleep = now;
   1794		io_schedule_timeout(pause);
   1795
   1796		current->dirty_paused_when = now + pause;
   1797		current->nr_dirtied = 0;
   1798		current->nr_dirtied_pause = nr_dirtied_pause;
   1799
   1800		/*
   1801		 * This is typically equal to (dirty < thresh) and can also
   1802		 * keep "1000+ dd on a slow USB stick" under control.
   1803		 */
   1804		if (task_ratelimit)
   1805			break;
   1806
   1807		/*
    1808		 * In the case of an unresponsive NFS server whose dirty
    1809		 * pages exceed dirty_thresh, give the other good wbs a pipe
    1810		 * to go through, so that tasks on them still remain responsive.
   1811		 *
   1812		 * In theory 1 page is enough to keep the consumer-producer
   1813		 * pipe going: the flusher cleans 1 page => the task dirties 1
   1814		 * more page. However wb_dirty has accounting errors.  So use
   1815		 * the larger and more IO friendly wb_stat_error.
   1816		 */
   1817		if (sdtc->wb_dirty <= wb_stat_error())
   1818			break;
   1819
   1820		if (fatal_signal_pending(current))
   1821			break;
   1822	}
   1823
   1824	if (!dirty_exceeded && wb->dirty_exceeded)
   1825		wb->dirty_exceeded = 0;
   1826
   1827	if (writeback_in_progress(wb))
   1828		return;
   1829
   1830	/*
   1831	 * In laptop mode, we wait until hitting the higher threshold before
   1832	 * starting background writeout, and then write out all the way down
   1833	 * to the lower threshold.  So slow writers cause minimal disk activity.
   1834	 *
   1835	 * In normal mode, we start background writeout at the lower
   1836	 * background_thresh, to keep the amount of dirty memory low.
   1837	 */
   1838	if (laptop_mode)
   1839		return;
   1840
   1841	if (nr_reclaimable > gdtc->bg_thresh)
   1842		wb_start_background_writeback(wb);
   1843}
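
        /*
         * To illustrate the core pacing step with made-up but plausible
         * numbers (HZ=1000): with task_ratelimit = 4000 pages/s and
         * pages_dirtied = 120, period = 1000 * 120 / 4000 = 30 jiffies.  If
         * the task spent 12ms of "think time" since its previous pause, it
         * sleeps pause = 30 - 12 = 18ms (assuming that is above min_pause),
         * so its long-term dirtying rate converges to ~4000 pages/s no matter
         * how fast it can dirty memory between pauses.
         */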
   1844
   1845static DEFINE_PER_CPU(int, bdp_ratelimits);
   1846
   1847/*
   1848 * Normal tasks are throttled by
   1849 *	loop {
   1850 *		dirty tsk->nr_dirtied_pause pages;
   1851 *		take a snap in balance_dirty_pages();
   1852 *	}
    1853 * However there is a worst case: if every task exits immediately after dirtying
    1854 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
    1855 * called to throttle the page dirties. The solution is to save the not yet
   1856 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
   1857 * randomly into the running tasks. This works well for the above worst case,
   1858 * as the new task will pick up and accumulate the old task's leaked dirty
   1859 * count and eventually get throttled.
   1860 */
   1861DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
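
        /*
         * Concretely: a short-lived task (say a gcc invocation in a kernel
         * build) whose quota happens to be 32 pages and which exits after
         * dirtying 31 of them never enters balance_dirty_pages().  Those 31
         * unthrottled pages are stashed in this CPU's dirty_throttle_leaks on
         * exit and credited to the next dirtier on the CPU in
         * balance_dirty_pages_ratelimited() below, so the leaked dirties are
         * eventually throttled by whichever task inherits them.
         */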
   1862
   1863/**
   1864 * balance_dirty_pages_ratelimited - balance dirty memory state
   1865 * @mapping: address_space which was dirtied
   1866 *
   1867 * Processes which are dirtying memory should call in here once for each page
   1868 * which was newly dirtied.  The function will periodically check the system's
   1869 * dirty state and will initiate writeback if needed.
   1870 *
   1871 * Once we're over the dirty memory limit we decrease the ratelimiting
   1872 * by a lot, to prevent individual processes from overshooting the limit
   1873 * by (ratelimit_pages) each.
   1874 */
   1875void balance_dirty_pages_ratelimited(struct address_space *mapping)
   1876{
   1877	struct inode *inode = mapping->host;
   1878	struct backing_dev_info *bdi = inode_to_bdi(inode);
   1879	struct bdi_writeback *wb = NULL;
   1880	int ratelimit;
   1881	int *p;
   1882
   1883	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
   1884		return;
   1885
   1886	if (inode_cgwb_enabled(inode))
   1887		wb = wb_get_create_current(bdi, GFP_KERNEL);
   1888	if (!wb)
   1889		wb = &bdi->wb;
   1890
   1891	ratelimit = current->nr_dirtied_pause;
   1892	if (wb->dirty_exceeded)
   1893		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
   1894
   1895	preempt_disable();
   1896	/*
    1897	 * This prevents one CPU from accumulating too many dirtied pages without
    1898	 * calling into balance_dirty_pages(), which can happen when there are
    1899	 * 1000+ tasks that all start dirtying pages at exactly the same
    1900	 * time and hence all honour a too-large initial task->nr_dirtied_pause.
   1901	 */
   1902	p =  this_cpu_ptr(&bdp_ratelimits);
   1903	if (unlikely(current->nr_dirtied >= ratelimit))
   1904		*p = 0;
   1905	else if (unlikely(*p >= ratelimit_pages)) {
   1906		*p = 0;
   1907		ratelimit = 0;
   1908	}
   1909	/*
    1910	 * Pick up the dirtied pages left behind by exited tasks. This prevents
    1911	 * lots of short-lived tasks (e.g. gcc invocations in a kernel build) from
    1912	 * escaping the dirty throttling and livelocking other long-running dirtiers.
   1913	 */
   1914	p = this_cpu_ptr(&dirty_throttle_leaks);
   1915	if (*p > 0 && current->nr_dirtied < ratelimit) {
   1916		unsigned long nr_pages_dirtied;
   1917		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
   1918		*p -= nr_pages_dirtied;
   1919		current->nr_dirtied += nr_pages_dirtied;
   1920	}
   1921	preempt_enable();
   1922
   1923	if (unlikely(current->nr_dirtied >= ratelimit))
   1924		balance_dirty_pages(wb, current->nr_dirtied);
   1925
   1926	wb_put(wb);
   1927}
   1928EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
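
        /*
         * With 4KB pages the dirty_exceeded clamp above is
         * min(ratelimit, 32 >> (12 - 10)) = 8 pages, so once wb->dirty_exceeded
         * is set every task writing through that wb re-checks the limits after
         * at most 8 newly dirtied pages instead of after its usual (possibly
         * much larger) nr_dirtied_pause quota.  Independently, the per-CPU
         * bdp_ratelimits counter forces a trip into balance_dirty_pages() once
         * a CPU has accumulated ratelimit_pages dirtied pages, even if no
         * single task has reached its own quota.
         */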
   1929
   1930/**
   1931 * wb_over_bg_thresh - does @wb need to be written back?
   1932 * @wb: bdi_writeback of interest
   1933 *
   1934 * Determines whether background writeback should keep writing @wb or it's
   1935 * clean enough.
   1936 *
   1937 * Return: %true if writeback should continue.
   1938 */
   1939bool wb_over_bg_thresh(struct bdi_writeback *wb)
   1940{
   1941	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
   1942	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
   1943	struct dirty_throttle_control * const gdtc = &gdtc_stor;
   1944	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
   1945						     &mdtc_stor : NULL;
   1946	unsigned long reclaimable;
   1947	unsigned long thresh;
   1948
   1949	/*
   1950	 * Similar to balance_dirty_pages() but ignores pages being written
   1951	 * as we're trying to decide whether to put more under writeback.
   1952	 */
   1953	gdtc->avail = global_dirtyable_memory();
   1954	gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
   1955	domain_dirty_limits(gdtc);
   1956
   1957	if (gdtc->dirty > gdtc->bg_thresh)
   1958		return true;
   1959
   1960	thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
   1961	if (thresh < 2 * wb_stat_error())
   1962		reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
   1963	else
   1964		reclaimable = wb_stat(wb, WB_RECLAIMABLE);
   1965
   1966	if (reclaimable > thresh)
   1967		return true;
   1968
   1969	if (mdtc) {
   1970		unsigned long filepages, headroom, writeback;
   1971
   1972		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
   1973				    &writeback);
   1974		mdtc_calc_avail(mdtc, filepages, headroom);
   1975		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
   1976
   1977		if (mdtc->dirty > mdtc->bg_thresh)
   1978			return true;
   1979
   1980		thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
   1981		if (thresh < 2 * wb_stat_error())
   1982			reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
   1983		else
   1984			reclaimable = wb_stat(wb, WB_RECLAIMABLE);
   1985
   1986		if (reclaimable > thresh)
   1987			return true;
   1988	}
   1989
   1990	return false;
   1991}
   1992
   1993#ifdef CONFIG_SYSCTL
   1994/*
   1995 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
   1996 */
   1997static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
   1998		void *buffer, size_t *length, loff_t *ppos)
   1999{
   2000	unsigned int old_interval = dirty_writeback_interval;
   2001	int ret;
   2002
   2003	ret = proc_dointvec(table, write, buffer, length, ppos);
   2004
   2005	/*
   2006	 * Writing 0 to dirty_writeback_interval will disable periodic writeback
    2007	 * and a different non-zero value will wake up the writeback threads.
   2008	 * wb_wakeup_delayed() would be more appropriate, but it's a pain to
   2009	 * iterate over all bdis and wbs.
   2010	 * The reason we do this is to make the change take effect immediately.
   2011	 */
   2012	if (!ret && write && dirty_writeback_interval &&
   2013		dirty_writeback_interval != old_interval)
   2014		wakeup_flusher_threads(WB_REASON_PERIODIC);
   2015
   2016	return ret;
   2017}
   2018#endif
   2019
   2020void laptop_mode_timer_fn(struct timer_list *t)
   2021{
   2022	struct backing_dev_info *backing_dev_info =
   2023		from_timer(backing_dev_info, t, laptop_mode_wb_timer);
   2024
   2025	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
   2026}
   2027
   2028/*
   2029 * We've spun up the disk and we're in laptop mode: schedule writeback
   2030 * of all dirty data a few seconds from now.  If the flush is already scheduled
   2031 * then push it back - the user is still using the disk.
   2032 */
   2033void laptop_io_completion(struct backing_dev_info *info)
   2034{
   2035	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
   2036}
   2037
   2038/*
   2039 * We're in laptop mode and we've just synced. The sync's writes will have
   2040 * caused another writeback to be scheduled by laptop_io_completion.
   2041 * Nothing needs to be written back anymore, so we unschedule the writeback.
   2042 */
   2043void laptop_sync_completion(void)
   2044{
   2045	struct backing_dev_info *bdi;
   2046
   2047	rcu_read_lock();
   2048
   2049	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
   2050		del_timer(&bdi->laptop_mode_wb_timer);
   2051
   2052	rcu_read_unlock();
   2053}
   2054
   2055/*
   2056 * If ratelimit_pages is too high then we can get into dirty-data overload
   2057 * if a large number of processes all perform writes at the same time.
   2058 *
   2059 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
   2060 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
   2061 * thresholds.
   2062 */
   2063
   2064void writeback_set_ratelimit(void)
   2065{
   2066	struct wb_domain *dom = &global_wb_domain;
   2067	unsigned long background_thresh;
   2068	unsigned long dirty_thresh;
   2069
   2070	global_dirty_limits(&background_thresh, &dirty_thresh);
   2071	dom->dirty_limit = dirty_thresh;
   2072	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
   2073	if (ratelimit_pages < 16)
   2074		ratelimit_pages = 16;
   2075}
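
        /*
         * Example with 16 online CPUs and a dirty_thresh of 524288 pages
         * (2GB with 4KB pages): ratelimit_pages = 524288 / (16 * 32) = 1024,
         * so each CPU may dirty at most ~4MB between forced checks and all
         * CPUs combined can overshoot the threshold by at most
         * 16 * 1024 = 16384 pages = 64MB, i.e. 1/32 ~= 3% as stated above.
         */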
   2076
   2077static int page_writeback_cpu_online(unsigned int cpu)
   2078{
   2079	writeback_set_ratelimit();
   2080	return 0;
   2081}
   2082
   2083#ifdef CONFIG_SYSCTL
   2084
   2085/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
   2086static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
   2087
   2088static struct ctl_table vm_page_writeback_sysctls[] = {
   2089	{
   2090		.procname   = "dirty_background_ratio",
   2091		.data       = &dirty_background_ratio,
   2092		.maxlen     = sizeof(dirty_background_ratio),
   2093		.mode       = 0644,
   2094		.proc_handler   = dirty_background_ratio_handler,
   2095		.extra1     = SYSCTL_ZERO,
   2096		.extra2     = SYSCTL_ONE_HUNDRED,
   2097	},
   2098	{
   2099		.procname   = "dirty_background_bytes",
   2100		.data       = &dirty_background_bytes,
   2101		.maxlen     = sizeof(dirty_background_bytes),
   2102		.mode       = 0644,
   2103		.proc_handler   = dirty_background_bytes_handler,
   2104		.extra1     = SYSCTL_LONG_ONE,
   2105	},
   2106	{
   2107		.procname   = "dirty_ratio",
   2108		.data       = &vm_dirty_ratio,
   2109		.maxlen     = sizeof(vm_dirty_ratio),
   2110		.mode       = 0644,
   2111		.proc_handler   = dirty_ratio_handler,
   2112		.extra1     = SYSCTL_ZERO,
   2113		.extra2     = SYSCTL_ONE_HUNDRED,
   2114	},
   2115	{
   2116		.procname   = "dirty_bytes",
   2117		.data       = &vm_dirty_bytes,
   2118		.maxlen     = sizeof(vm_dirty_bytes),
   2119		.mode       = 0644,
   2120		.proc_handler   = dirty_bytes_handler,
   2121		.extra1     = (void *)&dirty_bytes_min,
   2122	},
   2123	{
   2124		.procname   = "dirty_writeback_centisecs",
   2125		.data       = &dirty_writeback_interval,
   2126		.maxlen     = sizeof(dirty_writeback_interval),
   2127		.mode       = 0644,
   2128		.proc_handler   = dirty_writeback_centisecs_handler,
   2129	},
   2130	{
   2131		.procname   = "dirty_expire_centisecs",
   2132		.data       = &dirty_expire_interval,
   2133		.maxlen     = sizeof(dirty_expire_interval),
   2134		.mode       = 0644,
   2135		.proc_handler   = proc_dointvec_minmax,
   2136		.extra1     = SYSCTL_ZERO,
   2137	},
   2138#ifdef CONFIG_HIGHMEM
   2139	{
   2140		.procname	= "highmem_is_dirtyable",
   2141		.data		= &vm_highmem_is_dirtyable,
   2142		.maxlen		= sizeof(vm_highmem_is_dirtyable),
   2143		.mode		= 0644,
   2144		.proc_handler	= proc_dointvec_minmax,
   2145		.extra1		= SYSCTL_ZERO,
   2146		.extra2		= SYSCTL_ONE,
   2147	},
   2148#endif
   2149	{
   2150		.procname	= "laptop_mode",
   2151		.data		= &laptop_mode,
   2152		.maxlen		= sizeof(laptop_mode),
   2153		.mode		= 0644,
   2154		.proc_handler	= proc_dointvec_jiffies,
   2155	},
   2156	{}
   2157};
   2158#endif
   2159
   2160/*
   2161 * Called early on to tune the page writeback dirty limits.
   2162 *
   2163 * We used to scale dirty pages according to how total memory
   2164 * related to pages that could be allocated for buffers.
   2165 *
   2166 * However, that was when we used "dirty_ratio" to scale with
   2167 * all memory, and we don't do that any more. "dirty_ratio"
   2168 * is now applied to total non-HIGHPAGE memory, and as such we can't
   2169 * get into the old insane situation any more where we had
   2170 * large amounts of dirty pages compared to a small amount of
   2171 * non-HIGHMEM memory.
   2172 *
   2173 * But we might still want to scale the dirty_ratio by how
    2174 * much memory the box has.
   2175 */
   2176void __init page_writeback_init(void)
   2177{
   2178	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
   2179
   2180	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
   2181			  page_writeback_cpu_online, NULL);
   2182	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
   2183			  page_writeback_cpu_online);
   2184#ifdef CONFIG_SYSCTL
   2185	register_sysctl_init("vm", vm_page_writeback_sysctls);
   2186#endif
   2187}
   2188
   2189/**
   2190 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
   2191 * @mapping: address space structure to write
   2192 * @start: starting page index
   2193 * @end: ending page index (inclusive)
   2194 *
   2195 * This function scans the page range from @start to @end (inclusive) and tags
    2196 * all pages that have the DIRTY tag set with a special TOWRITE tag. The idea
    2197 * is that write_cache_pages (or whoever calls this function) will then use the
    2198 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
   2199 * used to avoid livelocking of writeback by a process steadily creating new
   2200 * dirty pages in the file (thus it is important for this function to be quick
   2201 * so that it can tag pages faster than a dirtying process can create them).
   2202 */
   2203void tag_pages_for_writeback(struct address_space *mapping,
   2204			     pgoff_t start, pgoff_t end)
   2205{
   2206	XA_STATE(xas, &mapping->i_pages, start);
   2207	unsigned int tagged = 0;
   2208	void *page;
   2209
   2210	xas_lock_irq(&xas);
   2211	xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
   2212		xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
   2213		if (++tagged % XA_CHECK_SCHED)
   2214			continue;
   2215
   2216		xas_pause(&xas);
   2217		xas_unlock_irq(&xas);
   2218		cond_resched();
   2219		xas_lock_irq(&xas);
   2220	}
   2221	xas_unlock_irq(&xas);
   2222}
   2223EXPORT_SYMBOL(tag_pages_for_writeback);
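
        /*
         * The xas_pause()/cond_resched() dance above bounds the time spent
         * with the i_pages lock held and interrupts disabled: after every
         * XA_CHECK_SCHED tagged pages the walk is paused, the lock dropped
         * and the CPU given a chance to reschedule before tagging resumes,
         * so tagging a file with millions of dirty pages cannot stall other
         * i_pages users for the whole duration.
         */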
   2224
   2225/**
   2226 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
   2227 * @mapping: address space structure to write
   2228 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
   2229 * @writepage: function called for each page
   2230 * @data: data passed to writepage function
   2231 *
   2232 * If a page is already under I/O, write_cache_pages() skips it, even
   2233 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
   2234 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
   2235 * and msync() need to guarantee that all the data which was dirty at the time
   2236 * the call was made get new I/O started against them.  If wbc->sync_mode is
   2237 * WB_SYNC_ALL then we were called for data integrity and we must wait for
   2238 * existing IO to complete.
   2239 *
   2240 * To avoid livelocks (when other process dirties new pages), we first tag
   2241 * pages which should be written back with TOWRITE tag and only then start
   2242 * writing them. For data-integrity sync we have to be careful so that we do
   2243 * not miss some pages (e.g., because some other process has cleared TOWRITE
   2244 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
   2245 * by the process clearing the DIRTY tag (and submitting the page for IO).
   2246 *
   2247 * To avoid deadlocks between range_cyclic writeback and callers that hold
   2248 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
   2249 * we do not loop back to the start of the file. Doing so causes a page
   2250 * lock/page writeback access order inversion - we should only ever lock
   2251 * multiple pages in ascending page->index order, and looping back to the start
   2252 * of the file violates that rule and causes deadlocks.
   2253 *
   2254 * Return: %0 on success, negative error code otherwise
   2255 */
   2256int write_cache_pages(struct address_space *mapping,
   2257		      struct writeback_control *wbc, writepage_t writepage,
   2258		      void *data)
   2259{
   2260	int ret = 0;
   2261	int done = 0;
   2262	int error;
   2263	struct pagevec pvec;
   2264	int nr_pages;
   2265	pgoff_t index;
   2266	pgoff_t end;		/* Inclusive */
   2267	pgoff_t done_index;
   2268	int range_whole = 0;
   2269	xa_mark_t tag;
   2270
   2271	pagevec_init(&pvec);
   2272	if (wbc->range_cyclic) {
   2273		index = mapping->writeback_index; /* prev offset */
   2274		end = -1;
   2275	} else {
   2276		index = wbc->range_start >> PAGE_SHIFT;
   2277		end = wbc->range_end >> PAGE_SHIFT;
   2278		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
   2279			range_whole = 1;
   2280	}
   2281	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
   2282		tag_pages_for_writeback(mapping, index, end);
   2283		tag = PAGECACHE_TAG_TOWRITE;
   2284	} else {
   2285		tag = PAGECACHE_TAG_DIRTY;
   2286	}
   2287	done_index = index;
   2288	while (!done && (index <= end)) {
   2289		int i;
   2290
   2291		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
   2292				tag);
   2293		if (nr_pages == 0)
   2294			break;
   2295
   2296		for (i = 0; i < nr_pages; i++) {
   2297			struct page *page = pvec.pages[i];
   2298
   2299			done_index = page->index;
   2300
   2301			lock_page(page);
   2302
   2303			/*
   2304			 * Page truncated or invalidated. We can freely skip it
   2305			 * then, even for data integrity operations: the page
   2306			 * has disappeared concurrently, so there could be no
   2307			 * real expectation of this data integrity operation
   2308			 * even if there is now a new, dirty page at the same
   2309			 * pagecache address.
   2310			 */
   2311			if (unlikely(page->mapping != mapping)) {
   2312continue_unlock:
   2313				unlock_page(page);
   2314				continue;
   2315			}
   2316
   2317			if (!PageDirty(page)) {
   2318				/* someone wrote it for us */
   2319				goto continue_unlock;
   2320			}
   2321
   2322			if (PageWriteback(page)) {
   2323				if (wbc->sync_mode != WB_SYNC_NONE)
   2324					wait_on_page_writeback(page);
   2325				else
   2326					goto continue_unlock;
   2327			}
   2328
   2329			BUG_ON(PageWriteback(page));
   2330			if (!clear_page_dirty_for_io(page))
   2331				goto continue_unlock;
   2332
   2333			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
   2334			error = (*writepage)(page, wbc, data);
   2335			if (unlikely(error)) {
   2336				/*
   2337				 * Handle errors according to the type of
   2338				 * writeback. There's no need to continue for
   2339				 * background writeback. Just push done_index
   2340				 * past this page so media errors won't choke
   2341				 * writeout for the entire file. For integrity
   2342				 * writeback, we must process the entire dirty
   2343				 * set regardless of errors because the fs may
   2344				 * still have state to clear for each page. In
   2345				 * that case we continue processing and return
   2346				 * the first error.
   2347				 */
   2348				if (error == AOP_WRITEPAGE_ACTIVATE) {
   2349					unlock_page(page);
   2350					error = 0;
   2351				} else if (wbc->sync_mode != WB_SYNC_ALL) {
   2352					ret = error;
   2353					done_index = page->index + 1;
   2354					done = 1;
   2355					break;
   2356				}
   2357				if (!ret)
   2358					ret = error;
   2359			}
   2360
   2361			/*
   2362			 * We stop writing back only if we are not doing
   2363			 * integrity sync. In case of integrity sync we have to
   2364			 * keep going until we have written all the pages
   2365			 * we tagged for writeback prior to entering this loop.
   2366			 */
   2367			if (--wbc->nr_to_write <= 0 &&
   2368			    wbc->sync_mode == WB_SYNC_NONE) {
   2369				done = 1;
   2370				break;
   2371			}
   2372		}
   2373		pagevec_release(&pvec);
   2374		cond_resched();
   2375	}
   2376
   2377	/*
   2378	 * If we hit the last page and there is more work to be done: wrap
   2379	 * back the index back to the start of the file for the next
   2380	 * time we are called.
   2381	 */
   2382	if (wbc->range_cyclic && !done)
   2383		done_index = 0;
   2384	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
   2385		mapping->writeback_index = done_index;
   2386
   2387	return ret;
   2388}
   2389EXPORT_SYMBOL(write_cache_pages);
   2390
   2391/*
   2392 * Function used by generic_writepages to call the real writepage
   2393 * function and set the mapping flags on error
   2394 */
   2395static int __writepage(struct page *page, struct writeback_control *wbc,
   2396		       void *data)
   2397{
   2398	struct address_space *mapping = data;
   2399	int ret = mapping->a_ops->writepage(page, wbc);
   2400	mapping_set_error(mapping, ret);
   2401	return ret;
   2402}
   2403
   2404/**
   2405 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
   2406 * @mapping: address space structure to write
   2407 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
   2408 *
   2409 * This is a library function, which implements the writepages()
   2410 * address_space_operation.
   2411 *
   2412 * Return: %0 on success, negative error code otherwise
   2413 */
   2414int generic_writepages(struct address_space *mapping,
   2415		       struct writeback_control *wbc)
   2416{
   2417	struct blk_plug plug;
   2418	int ret;
   2419
   2420	/* deal with chardevs and other special file */
   2421	if (!mapping->a_ops->writepage)
   2422		return 0;
   2423
   2424	blk_start_plug(&plug);
   2425	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
   2426	blk_finish_plug(&plug);
   2427	return ret;
   2428}
   2429
   2430EXPORT_SYMBOL(generic_writepages);
   2431
   2432int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
   2433{
   2434	int ret;
   2435	struct bdi_writeback *wb;
   2436
   2437	if (wbc->nr_to_write <= 0)
   2438		return 0;
   2439	wb = inode_to_wb_wbc(mapping->host, wbc);
   2440	wb_bandwidth_estimate_start(wb);
   2441	while (1) {
   2442		if (mapping->a_ops->writepages)
   2443			ret = mapping->a_ops->writepages(mapping, wbc);
   2444		else
   2445			ret = generic_writepages(mapping, wbc);
   2446		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
   2447			break;
   2448
   2449		/*
   2450		 * Lacking an allocation context or the locality or writeback
   2451		 * state of any of the inode's pages, throttle based on
   2452		 * writeback activity on the local node. It's as good a
   2453		 * guess as any.
   2454		 */
   2455		reclaim_throttle(NODE_DATA(numa_node_id()),
   2456			VMSCAN_THROTTLE_WRITEBACK);
   2457	}
   2458	/*
   2459	 * Usually few pages are written by now from those we've just submitted
   2460	 * but if there's constant writeback being submitted, this makes sure
   2461	 * writeback bandwidth is updated once in a while.
   2462	 */
   2463	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
   2464				   BANDWIDTH_INTERVAL))
   2465		wb_update_bandwidth(wb);
   2466	return ret;
   2467}
   2468
   2469/**
   2470 * folio_write_one - write out a single folio and wait on I/O.
   2471 * @folio: The folio to write.
   2472 *
   2473 * The folio must be locked by the caller and will be unlocked upon return.
   2474 *
   2475 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
   2476 * function returns.
   2477 *
   2478 * Return: %0 on success, negative error code otherwise
   2479 */
   2480int folio_write_one(struct folio *folio)
   2481{
   2482	struct address_space *mapping = folio->mapping;
   2483	int ret = 0;
   2484	struct writeback_control wbc = {
   2485		.sync_mode = WB_SYNC_ALL,
   2486		.nr_to_write = folio_nr_pages(folio),
   2487	};
   2488
   2489	BUG_ON(!folio_test_locked(folio));
   2490
   2491	folio_wait_writeback(folio);
   2492
   2493	if (folio_clear_dirty_for_io(folio)) {
   2494		folio_get(folio);
   2495		ret = mapping->a_ops->writepage(&folio->page, &wbc);
   2496		if (ret == 0)
   2497			folio_wait_writeback(folio);
   2498		folio_put(folio);
   2499	} else {
   2500		folio_unlock(folio);
   2501	}
   2502
   2503	if (!ret)
   2504		ret = filemap_check_errors(mapping);
   2505	return ret;
   2506}
   2507EXPORT_SYMBOL(folio_write_one);
   2508
   2509/*
   2510 * For address_spaces which do not use buffers nor write back.
   2511 */
   2512bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
   2513{
   2514	if (!folio_test_dirty(folio))
   2515		return !folio_test_set_dirty(folio);
   2516	return false;
   2517}
   2518EXPORT_SYMBOL(noop_dirty_folio);
   2519
   2520/*
   2521 * Helper function for set_page_dirty family.
   2522 *
   2523 * Caller must hold lock_page_memcg().
   2524 *
   2525 * NOTE: This relies on being atomic wrt interrupts.
   2526 */
   2527static void folio_account_dirtied(struct folio *folio,
   2528		struct address_space *mapping)
   2529{
   2530	struct inode *inode = mapping->host;
   2531
   2532	trace_writeback_dirty_folio(folio, mapping);
   2533
   2534	if (mapping_can_writeback(mapping)) {
   2535		struct bdi_writeback *wb;
   2536		long nr = folio_nr_pages(folio);
   2537
   2538		inode_attach_wb(inode, &folio->page);
   2539		wb = inode_to_wb(inode);
   2540
   2541		__lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
   2542		__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
   2543		__node_stat_mod_folio(folio, NR_DIRTIED, nr);
   2544		wb_stat_mod(wb, WB_RECLAIMABLE, nr);
   2545		wb_stat_mod(wb, WB_DIRTIED, nr);
   2546		task_io_account_write(nr * PAGE_SIZE);
   2547		current->nr_dirtied += nr;
   2548		__this_cpu_add(bdp_ratelimits, nr);
   2549
   2550		mem_cgroup_track_foreign_dirty(folio, wb);
   2551	}
   2552}
   2553
   2554/*
   2555 * Helper function for deaccounting dirty page without writeback.
   2556 *
   2557 * Caller must hold lock_page_memcg().
   2558 */
   2559void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
   2560{
   2561	long nr = folio_nr_pages(folio);
   2562
   2563	lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
   2564	zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
   2565	wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
   2566	task_io_account_cancelled_write(nr * PAGE_SIZE);
   2567}
   2568
   2569/*
   2570 * Mark the folio dirty, and set it dirty in the page cache, and mark
   2571 * the inode dirty.
   2572 *
   2573 * If warn is true, then emit a warning if the folio is not uptodate and has
   2574 * not been truncated.
   2575 *
   2576 * The caller must hold lock_page_memcg().  Most callers have the folio
   2577 * locked.  A few have the folio blocked from truncation through other
   2578 * means (eg zap_page_range() has it mapped and is holding the page table
   2579 * lock).  This can also be called from mark_buffer_dirty(), which I
   2580 * cannot prove is always protected against truncate.
   2581 */
   2582void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
   2583			     int warn)
   2584{
   2585	unsigned long flags;
   2586
   2587	xa_lock_irqsave(&mapping->i_pages, flags);
   2588	if (folio->mapping) {	/* Race with truncate? */
   2589		WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
   2590		folio_account_dirtied(folio, mapping);
   2591		__xa_set_mark(&mapping->i_pages, folio_index(folio),
   2592				PAGECACHE_TAG_DIRTY);
   2593	}
   2594	xa_unlock_irqrestore(&mapping->i_pages, flags);
   2595}
   2596
   2597/**
   2598 * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
   2599 * @mapping: Address space this folio belongs to.
   2600 * @folio: Folio to be marked as dirty.
   2601 *
   2602 * Filesystems which do not use buffer heads should call this function
   2603 * from their set_page_dirty address space operation.  It ignores the
   2604 * contents of folio_get_private(), so if the filesystem marks individual
   2605 * blocks as dirty, the filesystem should handle that itself.
   2606 *
   2607 * This is also sometimes used by filesystems which use buffer_heads when
   2608 * a single buffer is being dirtied: we want to set the folio dirty in
   2609 * that case, but not all the buffers.  This is a "bottom-up" dirtying,
   2610 * whereas block_dirty_folio() is a "top-down" dirtying.
   2611 *
   2612 * The caller must ensure this doesn't race with truncation.  Most will
   2613 * simply hold the folio lock, but e.g. zap_pte_range() calls with the
   2614 * folio mapped and the pte lock held, which also locks out truncation.
   2615 */
   2616bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
   2617{
   2618	folio_memcg_lock(folio);
   2619	if (folio_test_set_dirty(folio)) {
   2620		folio_memcg_unlock(folio);
   2621		return false;
   2622	}
   2623
   2624	__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
   2625	folio_memcg_unlock(folio);
   2626
   2627	if (mapping->host) {
   2628		/* !PageAnon && !swapper_space */
   2629		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
   2630	}
   2631	return true;
   2632}
   2633EXPORT_SYMBOL(filemap_dirty_folio);
   2634
   2635/**
   2636 * folio_account_redirty - Manually account for redirtying a page.
   2637 * @folio: The folio which is being redirtied.
   2638 *
   2639 * Most filesystems should call folio_redirty_for_writepage() instead
    2640 * of this function.  If your filesystem is doing writeback outside the
   2641 * context of a writeback_control(), it can call this when redirtying
   2642 * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
   2643 * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
   2644 * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
    2645 * WB_WRITTEN) in the long term. The mismatches will lead to systematic errors
   2646 */
   2647void folio_account_redirty(struct folio *folio)
   2648{
   2649	struct address_space *mapping = folio->mapping;
   2650
   2651	if (mapping && mapping_can_writeback(mapping)) {
   2652		struct inode *inode = mapping->host;
   2653		struct bdi_writeback *wb;
   2654		struct wb_lock_cookie cookie = {};
   2655		long nr = folio_nr_pages(folio);
   2656
   2657		wb = unlocked_inode_to_wb_begin(inode, &cookie);
   2658		current->nr_dirtied -= nr;
   2659		node_stat_mod_folio(folio, NR_DIRTIED, -nr);
   2660		wb_stat_mod(wb, WB_DIRTIED, -nr);
   2661		unlocked_inode_to_wb_end(inode, &cookie);
   2662	}
   2663}
   2664EXPORT_SYMBOL(folio_account_redirty);
   2665
   2666/**
   2667 * folio_redirty_for_writepage - Decline to write a dirty folio.
   2668 * @wbc: The writeback control.
   2669 * @folio: The folio.
   2670 *
   2671 * When a writepage implementation decides that it doesn't want to write
   2672 * @folio for some reason, it should call this function, unlock @folio and
   2673 * return 0.
   2674 *
   2675 * Return: True if we redirtied the folio.  False if someone else dirtied
   2676 * it first.
   2677 */
   2678bool folio_redirty_for_writepage(struct writeback_control *wbc,
   2679		struct folio *folio)
   2680{
   2681	bool ret;
   2682	long nr = folio_nr_pages(folio);
   2683
   2684	wbc->pages_skipped += nr;
   2685	ret = filemap_dirty_folio(folio->mapping, folio);
   2686	folio_account_redirty(folio);
   2687
   2688	return ret;
   2689}
   2690EXPORT_SYMBOL(folio_redirty_for_writepage);
   2691
   2692/**
   2693 * folio_mark_dirty - Mark a folio as being modified.
   2694 * @folio: The folio.
   2695 *
   2696 * The folio may not be truncated while this function is running.
   2697 * Holding the folio lock is sufficient to prevent truncation, but some
   2698 * callers cannot acquire a sleeping lock.  These callers instead hold
   2699 * the page table lock for a page table which contains at least one page
   2700 * in this folio.  Truncation will block on the page table lock as it
   2701 * unmaps pages before removing the folio from its mapping.
   2702 *
   2703 * Return: True if the folio was newly dirtied, false if it was already dirty.
   2704 */
   2705bool folio_mark_dirty(struct folio *folio)
   2706{
   2707	struct address_space *mapping = folio_mapping(folio);
   2708
   2709	if (likely(mapping)) {
   2710		/*
    2711		 * readahead/lru_deactivate_page could leave
    2712		 * PG_readahead/PG_reclaim set due to a race with folio_end_writeback.
    2713		 * For readahead, if the folio is written, the flags will be
    2714		 * reset, so no problem.
    2715		 * For lru_deactivate_page, if the folio is redirtied,
    2716		 * the flag will be reset, so no problem either; but if the
    2717		 * folio is used by readahead it will confuse readahead
    2718		 * and make it restart the size ramp-up process. That is
    2719		 * only a trivial problem.
   2720		 */
   2721		if (folio_test_reclaim(folio))
   2722			folio_clear_reclaim(folio);
   2723		return mapping->a_ops->dirty_folio(mapping, folio);
   2724	}
   2725
   2726	return noop_dirty_folio(mapping, folio);
   2727}
   2728EXPORT_SYMBOL(folio_mark_dirty);
   2729
   2730/*
   2731 * set_page_dirty() is racy if the caller has no reference against
   2732 * page->mapping->host, and if the page is unlocked.  This is because another
   2733 * CPU could truncate the page off the mapping and then free the mapping.
   2734 *
   2735 * Usually, the page _is_ locked, or the caller is a user-space process which
   2736 * holds a reference on the inode by having an open file.
   2737 *
   2738 * In other cases, the page should be locked before running set_page_dirty().
   2739 */
   2740int set_page_dirty_lock(struct page *page)
   2741{
   2742	int ret;
   2743
   2744	lock_page(page);
   2745	ret = set_page_dirty(page);
   2746	unlock_page(page);
   2747	return ret;
   2748}
   2749EXPORT_SYMBOL(set_page_dirty_lock);
   2750
   2751/*
   2752 * This cancels just the dirty bit on the kernel page itself, it does NOT
    2753 * actually remove dirty bits on any mmaps that may be around. It also
   2754 * leaves the page tagged dirty, so any sync activity will still find it on
   2755 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
   2756 * look at the dirty bits in the VM.
   2757 *
   2758 * Doing this should *normally* only ever be done when a page is truncated,
   2759 * and is not actually mapped anywhere at all. However, fs/buffer.c does
   2760 * this when it notices that somebody has cleaned out all the buffers on a
   2761 * page without actually doing it through the VM. Can you say "ext3 is
   2762 * horribly ugly"? Thought you could.
   2763 */
   2764void __folio_cancel_dirty(struct folio *folio)
   2765{
   2766	struct address_space *mapping = folio_mapping(folio);
   2767
   2768	if (mapping_can_writeback(mapping)) {
   2769		struct inode *inode = mapping->host;
   2770		struct bdi_writeback *wb;
   2771		struct wb_lock_cookie cookie = {};
   2772
   2773		folio_memcg_lock(folio);
   2774		wb = unlocked_inode_to_wb_begin(inode, &cookie);
   2775
   2776		if (folio_test_clear_dirty(folio))
   2777			folio_account_cleaned(folio, wb);
   2778
   2779		unlocked_inode_to_wb_end(inode, &cookie);
   2780		folio_memcg_unlock(folio);
   2781	} else {
   2782		folio_clear_dirty(folio);
   2783	}
   2784}
   2785EXPORT_SYMBOL(__folio_cancel_dirty);
   2786
   2787/*
   2788 * Clear a folio's dirty flag, while caring for dirty memory accounting.
   2789 * Returns true if the folio was previously dirty.
   2790 *
   2791 * This is for preparing to put the folio under writeout.  We leave
   2792 * the folio tagged as dirty in the xarray so that a concurrent
   2793 * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
   2794 * The ->writepage implementation will run either folio_start_writeback()
   2795 * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
   2796 * and xarray dirty tag back into sync.
   2797 *
   2798 * This incoherency between the folio's dirty flag and xarray tag is
   2799 * unfortunate, but it only exists while the folio is locked.
   2800 */
   2801bool folio_clear_dirty_for_io(struct folio *folio)
   2802{
   2803	struct address_space *mapping = folio_mapping(folio);
   2804	bool ret = false;
   2805
   2806	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
   2807
   2808	if (mapping && mapping_can_writeback(mapping)) {
   2809		struct inode *inode = mapping->host;
   2810		struct bdi_writeback *wb;
   2811		struct wb_lock_cookie cookie = {};
   2812
   2813		/*
   2814		 * Yes, Virginia, this is indeed insane.
   2815		 *
   2816		 * We use this sequence to make sure that
   2817		 *  (a) we account for dirty stats properly
   2818		 *  (b) we tell the low-level filesystem to
   2819		 *      mark the whole folio dirty if it was
   2820		 *      dirty in a pagetable. Only to then
   2821		 *  (c) clean the folio again and return 1 to
   2822		 *      cause the writeback.
   2823		 *
   2824		 * This way we avoid all nasty races with the
   2825		 * dirty bit in multiple places and clearing
   2826		 * them concurrently from different threads.
   2827		 *
   2828		 * Note! Normally the "folio_mark_dirty(folio)"
   2829		 * has no effect on the actual dirty bit - since
   2830		 * that will already usually be set. But we
   2831		 * need the side effects, and it can help us
   2832		 * avoid races.
   2833		 *
   2834		 * We basically use the folio "master dirty bit"
   2835		 * as a serialization point for all the different
   2836		 * threads doing their things.
   2837		 */
   2838		if (folio_mkclean(folio))
   2839			folio_mark_dirty(folio);
   2840		/*
   2841		 * We carefully synchronise fault handlers against
   2842		 * installing a dirty pte and marking the folio dirty
   2843		 * at this point.  We do this by having them hold the
   2844		 * page lock while dirtying the folio, and folios are
   2845		 * always locked coming in here, so we get the desired
   2846		 * exclusion.
   2847		 */
   2848		wb = unlocked_inode_to_wb_begin(inode, &cookie);
   2849		if (folio_test_clear_dirty(folio)) {
   2850			long nr = folio_nr_pages(folio);
   2851			lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
   2852			zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
   2853			wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
   2854			ret = true;
   2855		}
   2856		unlocked_inode_to_wb_end(inode, &cookie);
   2857		return ret;
   2858	}
   2859	return folio_test_clear_dirty(folio);
   2860}
   2861EXPORT_SYMBOL(folio_clear_dirty_for_io);
   2862
   2863static void wb_inode_writeback_start(struct bdi_writeback *wb)
   2864{
   2865	atomic_inc(&wb->writeback_inodes);
   2866}
   2867
   2868static void wb_inode_writeback_end(struct bdi_writeback *wb)
   2869{
   2870	atomic_dec(&wb->writeback_inodes);
   2871	/*
   2872	 * Make sure estimate of writeback throughput gets updated after
   2873	 * writeback completed. We delay the update by BANDWIDTH_INTERVAL
   2874	 * (which is the interval other bandwidth updates use for batching) so
   2875	 * that if multiple inodes end writeback at a similar time, they get
   2876	 * batched into one bandwidth update.
   2877	 */
   2878	queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
   2879}
   2880
   2881bool __folio_end_writeback(struct folio *folio)
   2882{
   2883	long nr = folio_nr_pages(folio);
   2884	struct address_space *mapping = folio_mapping(folio);
   2885	bool ret;
   2886
   2887	folio_memcg_lock(folio);
   2888	if (mapping && mapping_use_writeback_tags(mapping)) {
   2889		struct inode *inode = mapping->host;
   2890		struct backing_dev_info *bdi = inode_to_bdi(inode);
   2891		unsigned long flags;
   2892
   2893		xa_lock_irqsave(&mapping->i_pages, flags);
   2894		ret = folio_test_clear_writeback(folio);
   2895		if (ret) {
   2896			__xa_clear_mark(&mapping->i_pages, folio_index(folio),
   2897						PAGECACHE_TAG_WRITEBACK);
   2898			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
   2899				struct bdi_writeback *wb = inode_to_wb(inode);
   2900
   2901				wb_stat_mod(wb, WB_WRITEBACK, -nr);
   2902				__wb_writeout_add(wb, nr);
   2903				if (!mapping_tagged(mapping,
   2904						    PAGECACHE_TAG_WRITEBACK))
   2905					wb_inode_writeback_end(wb);
   2906			}
   2907		}
   2908
   2909		if (mapping->host && !mapping_tagged(mapping,
   2910						     PAGECACHE_TAG_WRITEBACK))
   2911			sb_clear_inode_writeback(mapping->host);
   2912
   2913		xa_unlock_irqrestore(&mapping->i_pages, flags);
   2914	} else {
   2915		ret = folio_test_clear_writeback(folio);
   2916	}
   2917	if (ret) {
   2918		lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
   2919		zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
   2920		node_stat_mod_folio(folio, NR_WRITTEN, nr);
   2921	}
   2922	folio_memcg_unlock(folio);
   2923	return ret;
   2924}
   2925
   2926bool __folio_start_writeback(struct folio *folio, bool keep_write)
   2927{
   2928	long nr = folio_nr_pages(folio);
   2929	struct address_space *mapping = folio_mapping(folio);
   2930	bool ret;
   2931	int access_ret;
   2932
   2933	folio_memcg_lock(folio);
   2934	if (mapping && mapping_use_writeback_tags(mapping)) {
   2935		XA_STATE(xas, &mapping->i_pages, folio_index(folio));
   2936		struct inode *inode = mapping->host;
   2937		struct backing_dev_info *bdi = inode_to_bdi(inode);
   2938		unsigned long flags;
   2939
   2940		xas_lock_irqsave(&xas, flags);
   2941		xas_load(&xas);
   2942		ret = folio_test_set_writeback(folio);
   2943		if (!ret) {
   2944			bool on_wblist;
   2945
   2946			on_wblist = mapping_tagged(mapping,
   2947						   PAGECACHE_TAG_WRITEBACK);
   2948
   2949			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
   2950			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
   2951				struct bdi_writeback *wb = inode_to_wb(inode);
   2952
   2953				wb_stat_mod(wb, WB_WRITEBACK, nr);
   2954				if (!on_wblist)
   2955					wb_inode_writeback_start(wb);
   2956			}
   2957
   2958			/*
   2959			 * We can come through here when swapping
   2960			 * anonymous folios, so we don't necessarily
   2961			 * have an inode to track for sync.
   2962			 */
   2963			if (mapping->host && !on_wblist)
   2964				sb_mark_inode_writeback(mapping->host);
   2965		}
   2966		if (!folio_test_dirty(folio))
   2967			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
   2968		if (!keep_write)
   2969			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
   2970		xas_unlock_irqrestore(&xas, flags);
   2971	} else {
   2972		ret = folio_test_set_writeback(folio);
   2973	}
   2974	if (!ret) {
   2975		lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
   2976		zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
   2977	}
   2978	folio_memcg_unlock(folio);
   2979	access_ret = arch_make_folio_accessible(folio);
   2980	/*
   2981	 * If writeback has been triggered on a page that cannot be made
   2982	 * accessible, it is too late to recover here.
   2983	 */
   2984	VM_BUG_ON_FOLIO(access_ret != 0, folio);
   2985
   2986	return ret;
   2987}
   2988EXPORT_SYMBOL(__folio_start_writeback);
   2989
   2990/**
   2991 * folio_wait_writeback - Wait for a folio to finish writeback.
   2992 * @folio: The folio to wait for.
   2993 *
   2994 * If the folio is currently being written back to storage, wait for the
   2995 * I/O to complete.
   2996 *
   2997 * Context: Sleeps.  Must be called in process context and with
   2998 * no spinlocks held.  Caller should hold a reference on the folio.
   2999 * If the folio is not locked, writeback may start again after writeback
   3000 * has finished.
   3001 */
   3002void folio_wait_writeback(struct folio *folio)
   3003{
   3004	while (folio_test_writeback(folio)) {
   3005		trace_folio_wait_writeback(folio, folio_mapping(folio));
   3006		folio_wait_bit(folio, PG_writeback);
   3007	}
   3008}
   3009EXPORT_SYMBOL_GPL(folio_wait_writeback);
   3010
   3011/**
   3012 * folio_wait_writeback_killable - Wait for a folio to finish writeback.
   3013 * @folio: The folio to wait for.
   3014 *
   3015 * If the folio is currently being written back to storage, wait for the
   3016 * I/O to complete or a fatal signal to arrive.
   3017 *
   3018 * Context: Sleeps.  Must be called in process context and with
   3019 * no spinlocks held.  Caller should hold a reference on the folio.
   3020 * If the folio is not locked, writeback may start again after writeback
   3021 * has finished.
   3022 * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
   3023 */
   3024int folio_wait_writeback_killable(struct folio *folio)
   3025{
   3026	while (folio_test_writeback(folio)) {
   3027		trace_folio_wait_writeback(folio, folio_mapping(folio));
   3028		if (folio_wait_bit_killable(folio, PG_writeback))
   3029			return -EINTR;
   3030	}
   3031
   3032	return 0;
   3033}
   3034EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
   3035
   3036/**
   3037 * folio_wait_stable() - wait for writeback to finish, if necessary.
   3038 * @folio: The folio to wait on.
   3039 *
   3040 * This function determines if the given folio is related to a backing
   3041 * device that requires folio contents to be held stable during writeback.
   3042 * If so, then it will wait for any pending writeback to complete.
   3043 *
   3044 * Context: Sleeps.  Must be called in process context and with
   3045 * no spinlocks held.  Caller should hold a reference on the folio.
   3046 * If the folio is not locked, writeback may start again after writeback
   3047 * has finished.
   3048 */
   3049void folio_wait_stable(struct folio *folio)
   3050{
   3051	if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
   3052		folio_wait_writeback(folio);
   3053}
   3054EXPORT_SYMBOL_GPL(folio_wait_stable);