cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

backing-dev.c (25278B)


// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#define K(x) ((x) << (PAGE_SHIFT - 10))

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

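/*
 * Emit the per-bdi writeback statistics exposed in the debugfs "stats" file:
 * the lengths of the dirty inode lists, the dirty thresholds and the
 * writeback counters of the root wb.
 */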
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	wb_thresh = wb_calc_thresh(wb, dirty_thresh);

	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "b_dirty_time:       %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
		   K(wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
		   (unsigned long) K(wb->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

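/*
 * sysfs store handler for the per-bdi read_ahead_kb attribute: parses the
 * value in KiB and stores it as a page count in bdi->ra_pages.
 */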
static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

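/*
 * BDI_SHOW() generates the sysfs ->show handler for a bdi attribute and
 * declares it read-write, pairing it with the matching *_store handler
 * defined separately.
 */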
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(default_bdi_init);

/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_bh(&wb->work_lock);
}

static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

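/*
 * Initialize a bdi_writeback: dirty inode lists, bandwidth estimates, the
 * delayed work items and the per-cpu writeback stat counters.
 */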
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int i, err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
	wb->dirty_sleep = jiffies;

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, gfp);
		if (err)
			goto out_destroy_stat;
	}

	return 0;

out_destroy_stat:
	while (i--)
		percpu_counter_destroy(&wb->stat[i]);
	fprop_local_destroy_percpu(&wb->completions);
	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_bh(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_bh(&wb->work_lock);
		return;
	}
	spin_unlock_bh(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work.  !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

static void wb_exit(struct bdi_writeback *wb)
{
	int i;

	WARN_ON(delayed_work_pending(&wb->dwork));

	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
		percpu_counter_destroy(&wb->stat[i]);

	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

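/*
 * Final release of a cgroup writeback structure: shut the wb down, drop the
 * css and bdi references it holds and free it via RCU.
 */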
static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(wb->blkcg_css);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	percpu_ref_exit(&wb->refcnt);
	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	kfree_rcu(wb, rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

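/*
 * Unlink @wb from the bdi's cgwb tree and its memcg/blkcg lists, move it to
 * the offline list and kill its refcount.  Caller must hold cgwb_lock.
 */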
static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

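/*
 * Create the wb for the @memcg_css / current blkcg pair on @bdi and insert
 * it into the bdi's cgwb tree, discarding a stale entry whose blkcg
 * association no longer matches.
 */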
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online.  Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	if (!memcg_css->parent)
		return &bdi->wb;

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
	}
	return ret;
}

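/*
 * Kill all cgwbs of @bdi and wait until every wb on its wb_list has been
 * shut down.  Called from bdi_unregister() once the root wb has been shut
 * down.
 */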
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback.  Processed wbs are placed at the end of the
 * list to guarantee forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes would effectively remove any bandwidth
		 * restrictions, which isn't the goal.  Instead, postpone it
		 * to the next run, by which time the io will likely have
		 * completed.  If some inodes get re-dirtied in the meantime,
		 * they will eventually be switched to a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @css.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
	struct bdi_writeback *wb, *next;
	struct list_head *list = blkcg_get_cgwb_list(css);

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, list, blkcg_node)
		cgwb_kill(wb);
	list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq.  Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

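/*
 * Initialize the fields of an already allocated bdi and its embedded root
 * wb.  Callers that want allocation and initialization in one step use
 * bdi_alloc() below.
 */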
int bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);

	ret = cgwb_bdi_init(bdi);

	return ret;
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);

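/*
 * Find the rb_tree slot for @id in bdi_tree, returning the link pointer to
 * use for insertion and, via @parentp, the parent node.  Caller must hold
 * bdi_lock.
 */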
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

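/*
 * Register @bdi under the formatted device name: create the sysfs device,
 * assign an id, and link the bdi into bdi_tree and bdi_list.
 */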
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	del_timer_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	/*
	 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
	 * update the global bdi_min_ratio.
	 */
	if (bdi->min_ratio)
		bdi_set_min_ratio(bdi, 0);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
			container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);