cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm.c (77482B)


      1/*
      2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
      3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
      4 *
      5 * This file is released under the GPL.
      6 */
      7
      8#include "dm-core.h"
      9#include "dm-rq.h"
     10#include "dm-uevent.h"
     11#include "dm-ima.h"
     12
     13#include <linux/init.h>
     14#include <linux/module.h>
     15#include <linux/mutex.h>
     16#include <linux/sched/mm.h>
     17#include <linux/sched/signal.h>
     18#include <linux/blkpg.h>
     19#include <linux/bio.h>
     20#include <linux/mempool.h>
     21#include <linux/dax.h>
     22#include <linux/slab.h>
     23#include <linux/idr.h>
     24#include <linux/uio.h>
     25#include <linux/hdreg.h>
     26#include <linux/delay.h>
     27#include <linux/wait.h>
     28#include <linux/pr.h>
     29#include <linux/refcount.h>
     30#include <linux/part_stat.h>
     31#include <linux/blk-crypto.h>
     32#include <linux/blk-crypto-profile.h>
     33
     34#define DM_MSG_PREFIX "core"
     35
     36/*
     37 * Cookies are numeric values sent with CHANGE and REMOVE
     38 * uevents while resuming, removing or renaming the device.
     39 */
     40#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
     41#define DM_COOKIE_LENGTH 24
     42
     43/*
     44 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
     45 * dm_io into one list, and reuse bio->bi_private as the list head. Before
     46 * ending this fs bio, we will recover its ->bi_private.
     47 */
     48#define REQ_DM_POLL_LIST	REQ_DRV
     49
     50static const char *_name = DM_NAME;
     51
     52static unsigned int major = 0;
     53static unsigned int _major = 0;
     54
     55static DEFINE_IDR(_minor_idr);
     56
     57static DEFINE_SPINLOCK(_minor_lock);
     58
     59static void do_deferred_remove(struct work_struct *w);
     60
     61static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
     62
     63static struct workqueue_struct *deferred_remove_workqueue;
     64
     65atomic_t dm_global_event_nr = ATOMIC_INIT(0);
     66DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
     67
     68void dm_issue_global_event(void)
     69{
     70	atomic_inc(&dm_global_event_nr);
     71	wake_up(&dm_global_eventq);
     72}
     73
     74DEFINE_STATIC_KEY_FALSE(stats_enabled);
     75DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
     76DEFINE_STATIC_KEY_FALSE(zoned_enabled);
     77
     78/*
     79 * One of these is allocated (on-stack) per original bio.
     80 */
     81struct clone_info {
     82	struct dm_table *map;
     83	struct bio *bio;
     84	struct dm_io *io;
     85	sector_t sector;
     86	unsigned sector_count;
     87	bool is_abnormal_io:1;
     88	bool submit_as_polled:1;
     89};
     90
     91#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
     92#define DM_IO_BIO_OFFSET \
     93	(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
     94
     95static inline struct dm_target_io *clone_to_tio(struct bio *clone)
     96{
     97	return container_of(clone, struct dm_target_io, clone);
     98}
     99
    100void *dm_per_bio_data(struct bio *bio, size_t data_size)
    101{
    102	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
    103		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
    104	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
    105}
    106EXPORT_SYMBOL_GPL(dm_per_bio_data);
    107
    108struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
    109{
    110	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
    111	if (io->magic == DM_IO_MAGIC)
    112		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
    113	BUG_ON(io->magic != DM_TIO_MAGIC);
    114	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
    115}
    116EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
    117
    118unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
    119{
    120	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
    121}
    122EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
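
/*
 * Illustrative sketch (editor's example, hypothetical target code; not part
 * of dm.c): a bio-based target that sets ti->per_io_data_size in its
 * constructor can recover its private per-bio state from any clone handed
 * to .map or .end_io:
 *
 *	struct example_ctx {                    // hypothetical, ti->private
 *		struct dm_dev *dev;
 *	};
 *
 *	struct example_per_bio_data {           // hypothetical
 *		unsigned long start_jiffies;
 *	};
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct example_ctx *ec = ti->private;
 *		struct example_per_bio_data *pb =
 *			dm_per_bio_data(bio, ti->per_io_data_size);
 *
 *		pb->start_jiffies = jiffies;
 *		bio_set_dev(bio, ec->dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 *
 * dm_bio_from_per_bio_data() is the inverse: given only the per-bio data
 * pointer (and the same size), it returns the owning clone bio.
 */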
    123
    124#define MINOR_ALLOCED ((void *)-1)
    125
    126#define DM_NUMA_NODE NUMA_NO_NODE
    127static int dm_numa_node = DM_NUMA_NODE;
    128
    129#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
    130static int swap_bios = DEFAULT_SWAP_BIOS;
    131static int get_swap_bios(void)
    132{
    133	int latch = READ_ONCE(swap_bios);
    134	if (unlikely(latch <= 0))
    135		latch = DEFAULT_SWAP_BIOS;
    136	return latch;
    137}
    138
    139struct table_device {
    140	struct list_head list;
    141	refcount_t count;
    142	struct dm_dev dm_dev;
    143};
    144
    145/*
    146 * Bio-based DM's mempools' reserved IOs set by the user.
    147 */
    148#define RESERVED_BIO_BASED_IOS		16
    149static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
    150
    151static int __dm_get_module_param_int(int *module_param, int min, int max)
    152{
    153	int param = READ_ONCE(*module_param);
    154	int modified_param = 0;
    155	bool modified = true;
    156
    157	if (param < min)
    158		modified_param = min;
    159	else if (param > max)
    160		modified_param = max;
    161	else
    162		modified = false;
    163
    164	if (modified) {
    165		(void)cmpxchg(module_param, param, modified_param);
    166		param = modified_param;
    167	}
    168
    169	return param;
    170}
    171
    172unsigned __dm_get_module_param(unsigned *module_param,
    173			       unsigned def, unsigned max)
    174{
    175	unsigned param = READ_ONCE(*module_param);
    176	unsigned modified_param = 0;
    177
    178	if (!param)
    179		modified_param = def;
    180	else if (param > max)
    181		modified_param = max;
    182
    183	if (modified_param) {
    184		(void)cmpxchg(module_param, param, modified_param);
    185		param = modified_param;
    186	}
    187
    188	return param;
    189}
    190
    191unsigned dm_get_reserved_bio_based_ios(void)
    192{
    193	return __dm_get_module_param(&reserved_bio_based_ios,
    194				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
    195}
    196EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
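
/*
 * Worked example of the clamping above (editor's note): with the dm_mod
 * module parameter "reserved_bio_based_ios" (declared with module_param()
 * later in this file) left at 0, the call returns RESERVED_BIO_BASED_IOS
 * (16); a value above DM_RESERVED_MAX_IOS is capped to that maximum, and
 * the clamped value is written back via cmpxchg() in __dm_get_module_param().
 */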
    197
    198static unsigned dm_get_numa_node(void)
    199{
    200	return __dm_get_module_param_int(&dm_numa_node,
    201					 DM_NUMA_NODE, num_online_nodes() - 1);
    202}
    203
    204static int __init local_init(void)
    205{
    206	int r;
    207
    208	r = dm_uevent_init();
    209	if (r)
    210		return r;
    211
    212	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
    213	if (!deferred_remove_workqueue) {
    214		r = -ENOMEM;
    215		goto out_uevent_exit;
    216	}
    217
    218	_major = major;
    219	r = register_blkdev(_major, _name);
    220	if (r < 0)
    221		goto out_free_workqueue;
    222
    223	if (!_major)
    224		_major = r;
    225
    226	return 0;
    227
    228out_free_workqueue:
    229	destroy_workqueue(deferred_remove_workqueue);
    230out_uevent_exit:
    231	dm_uevent_exit();
    232
    233	return r;
    234}
    235
    236static void local_exit(void)
    237{
    238	flush_scheduled_work();
    239	destroy_workqueue(deferred_remove_workqueue);
    240
    241	unregister_blkdev(_major, _name);
    242	dm_uevent_exit();
    243
    244	_major = 0;
    245
    246	DMINFO("cleaned up");
    247}
    248
    249static int (*_inits[])(void) __initdata = {
    250	local_init,
    251	dm_target_init,
    252	dm_linear_init,
    253	dm_stripe_init,
    254	dm_io_init,
    255	dm_kcopyd_init,
    256	dm_interface_init,
    257	dm_statistics_init,
    258};
    259
    260static void (*_exits[])(void) = {
    261	local_exit,
    262	dm_target_exit,
    263	dm_linear_exit,
    264	dm_stripe_exit,
    265	dm_io_exit,
    266	dm_kcopyd_exit,
    267	dm_interface_exit,
    268	dm_statistics_exit,
    269};
    270
    271static int __init dm_init(void)
    272{
    273	const int count = ARRAY_SIZE(_inits);
    274	int r, i;
    275
    276#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
    277	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
    278	       " Duplicate IMA measurements will not be recorded in the IMA log.");
    279#endif
    280
    281	for (i = 0; i < count; i++) {
    282		r = _inits[i]();
    283		if (r)
    284			goto bad;
    285	}
    286
    287	return 0;
    288bad:
    289	while (i--)
    290		_exits[i]();
    291
    292	return r;
    293}
    294
    295static void __exit dm_exit(void)
    296{
    297	int i = ARRAY_SIZE(_exits);
    298
    299	while (i--)
    300		_exits[i]();
    301
    302	/*
    303	 * Should be empty by this point.
    304	 */
    305	idr_destroy(&_minor_idr);
    306}
    307
    308/*
    309 * Block device functions
    310 */
    311int dm_deleting_md(struct mapped_device *md)
    312{
    313	return test_bit(DMF_DELETING, &md->flags);
    314}
    315
    316static int dm_blk_open(struct block_device *bdev, fmode_t mode)
    317{
    318	struct mapped_device *md;
    319
    320	spin_lock(&_minor_lock);
    321
    322	md = bdev->bd_disk->private_data;
    323	if (!md)
    324		goto out;
    325
    326	if (test_bit(DMF_FREEING, &md->flags) ||
    327	    dm_deleting_md(md)) {
    328		md = NULL;
    329		goto out;
    330	}
    331
    332	dm_get(md);
    333	atomic_inc(&md->open_count);
    334out:
    335	spin_unlock(&_minor_lock);
    336
    337	return md ? 0 : -ENXIO;
    338}
    339
    340static void dm_blk_close(struct gendisk *disk, fmode_t mode)
    341{
    342	struct mapped_device *md;
    343
    344	spin_lock(&_minor_lock);
    345
    346	md = disk->private_data;
    347	if (WARN_ON(!md))
    348		goto out;
    349
    350	if (atomic_dec_and_test(&md->open_count) &&
    351	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
    352		queue_work(deferred_remove_workqueue, &deferred_remove_work);
    353
    354	dm_put(md);
    355out:
    356	spin_unlock(&_minor_lock);
    357}
    358
    359int dm_open_count(struct mapped_device *md)
    360{
    361	return atomic_read(&md->open_count);
    362}
    363
    364/*
    365 * Guarantees nothing is using the device before it's deleted.
    366 */
    367int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
    368{
    369	int r = 0;
    370
    371	spin_lock(&_minor_lock);
    372
    373	if (dm_open_count(md)) {
    374		r = -EBUSY;
    375		if (mark_deferred)
    376			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
    377	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
    378		r = -EEXIST;
    379	else
    380		set_bit(DMF_DELETING, &md->flags);
    381
    382	spin_unlock(&_minor_lock);
    383
    384	return r;
    385}
    386
    387int dm_cancel_deferred_remove(struct mapped_device *md)
    388{
    389	int r = 0;
    390
    391	spin_lock(&_minor_lock);
    392
    393	if (test_bit(DMF_DELETING, &md->flags))
    394		r = -EBUSY;
    395	else
    396		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
    397
    398	spin_unlock(&_minor_lock);
    399
    400	return r;
    401}
    402
    403static void do_deferred_remove(struct work_struct *w)
    404{
    405	dm_deferred_remove();
    406}
    407
    408static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
    409{
    410	struct mapped_device *md = bdev->bd_disk->private_data;
    411
    412	return dm_get_geometry(md, geo);
    413}
    414
    415static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
    416			    struct block_device **bdev)
    417{
    418	struct dm_target *tgt;
    419	struct dm_table *map;
    420	int r;
    421
    422retry:
    423	r = -ENOTTY;
    424	map = dm_get_live_table(md, srcu_idx);
    425	if (!map || !dm_table_get_size(map))
    426		return r;
    427
    428	/* We only support devices that have a single target */
    429	if (dm_table_get_num_targets(map) != 1)
    430		return r;
    431
    432	tgt = dm_table_get_target(map, 0);
    433	if (!tgt->type->prepare_ioctl)
    434		return r;
    435
    436	if (dm_suspended_md(md))
    437		return -EAGAIN;
    438
    439	r = tgt->type->prepare_ioctl(tgt, bdev);
    440	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
    441		dm_put_live_table(md, *srcu_idx);
    442		msleep(10);
    443		goto retry;
    444	}
    445
    446	return r;
    447}
    448
    449static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
    450{
    451	dm_put_live_table(md, srcu_idx);
    452}
    453
    454static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
    455			unsigned int cmd, unsigned long arg)
    456{
    457	struct mapped_device *md = bdev->bd_disk->private_data;
    458	int r, srcu_idx;
    459
    460	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
    461	if (r < 0)
    462		goto out;
    463
    464	if (r > 0) {
    465		/*
    466		 * Target determined this ioctl is being issued against a
    467		 * subset of the parent bdev; require extra privileges.
    468		 */
    469		if (!capable(CAP_SYS_RAWIO)) {
    470			DMDEBUG_LIMIT(
    471	"%s: sending ioctl %x to DM device without required privilege.",
    472				current->comm, cmd);
    473			r = -ENOIOCTLCMD;
    474			goto out;
    475		}
    476	}
    477
    478	if (!bdev->bd_disk->fops->ioctl)
    479		r = -ENOTTY;
    480	else
    481		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
    482out:
    483	dm_unprepare_ioctl(md, srcu_idx);
    484	return r;
    485}
    486
    487u64 dm_start_time_ns_from_clone(struct bio *bio)
    488{
    489	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
    490}
    491EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
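
/*
 * Illustrative sketch (editor's example, hypothetical target code): a
 * target can derive per-IO latency from the start time DM core recorded
 * in alloc_io(), e.g. in its .end_io hook. Both values below are derived
 * from jiffies, so they are directly comparable:
 *
 *	static int example_end_io(struct dm_target *ti, struct bio *bio,
 *				  blk_status_t *error)
 *	{
 *		u64 lat_ns = jiffies_to_nsecs(jiffies) -
 *			     dm_start_time_ns_from_clone(bio);
 *
 *		// feed lat_ns into target-private statistics here
 *		return DM_ENDIO_DONE;
 *	}
 */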
    492
    493static bool bio_is_flush_with_data(struct bio *bio)
    494{
    495	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
    496}
    497
    498static void dm_io_acct(struct dm_io *io, bool end)
    499{
    500	struct dm_stats_aux *stats_aux = &io->stats_aux;
    501	unsigned long start_time = io->start_time;
    502	struct mapped_device *md = io->md;
    503	struct bio *bio = io->orig_bio;
    504	unsigned int sectors;
    505
    506	/*
    507	 * If REQ_PREFLUSH set, don't account payload, it will be
    508	 * submitted (and accounted) after this flush completes.
    509	 */
    510	if (bio_is_flush_with_data(bio))
    511		sectors = 0;
    512	else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
    513		sectors = bio_sectors(bio);
    514	else
    515		sectors = io->sectors;
    516
    517	if (!end)
    518		bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
    519				   start_time);
    520	else
    521		bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
    522
    523	if (static_branch_unlikely(&stats_enabled) &&
    524	    unlikely(dm_stats_used(&md->stats))) {
    525		sector_t sector;
    526
    527		if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
    528			sector = bio->bi_iter.bi_sector;
    529		else
    530			sector = bio_end_sector(bio) - io->sector_offset;
    531
    532		dm_stats_account_io(&md->stats, bio_data_dir(bio),
    533				    sector, sectors,
    534				    end, start_time, stats_aux);
    535	}
    536}
    537
    538static void __dm_start_io_acct(struct dm_io *io)
    539{
    540	dm_io_acct(io, false);
    541}
    542
    543static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
    544{
    545	/*
    546	 * Ensure IO accounting is only ever started once.
    547	 */
    548	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
    549		return;
    550
    551	/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
    552	if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
    553		dm_io_set_flag(io, DM_IO_ACCOUNTED);
    554	} else {
    555		unsigned long flags;
    556		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
    557		spin_lock_irqsave(&io->lock, flags);
    558		if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
    559			spin_unlock_irqrestore(&io->lock, flags);
    560			return;
    561		}
    562		dm_io_set_flag(io, DM_IO_ACCOUNTED);
    563		spin_unlock_irqrestore(&io->lock, flags);
    564	}
    565
    566	__dm_start_io_acct(io);
    567}
    568
    569static void dm_end_io_acct(struct dm_io *io)
    570{
    571	dm_io_acct(io, true);
    572}
    573
    574static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
    575{
    576	struct dm_io *io;
    577	struct dm_target_io *tio;
    578	struct bio *clone;
    579
    580	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
    581	/* Set default bdev, but target must bio_set_dev() before issuing IO */
    582	clone->bi_bdev = md->disk->part0;
    583
    584	tio = clone_to_tio(clone);
    585	tio->flags = 0;
    586	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
    587	tio->io = NULL;
    588
    589	io = container_of(tio, struct dm_io, tio);
    590	io->magic = DM_IO_MAGIC;
    591	io->status = BLK_STS_OK;
    592
    593	/* one ref is for submission, the other is for completion */
    594	atomic_set(&io->io_count, 2);
    595	this_cpu_inc(*md->pending_io);
    596	io->orig_bio = bio;
    597	io->split_bio = NULL;
    598	io->md = md;
    599	spin_lock_init(&io->lock);
    600	io->start_time = jiffies;
    601	io->flags = 0;
    602
    603	if (static_branch_unlikely(&stats_enabled))
    604		dm_stats_record_start(&md->stats, &io->stats_aux);
    605
    606	return io;
    607}
    608
    609static void free_io(struct dm_io *io)
    610{
    611	bio_put(&io->tio.clone);
    612}
    613
    614static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
    615			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
    616{
    617	struct dm_target_io *tio;
    618	struct bio *clone;
    619
    620	if (!ci->io->tio.io) {
    621		/* the dm_target_io embedded in ci->io is available */
    622		tio = &ci->io->tio;
    623		/* alloc_io() already initialized embedded clone */
    624		clone = &tio->clone;
    625	} else {
    626		struct mapped_device *md = ci->io->md;
    627
    628		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
    629					&md->mempools->bs);
    630		if (!clone)
    631			return NULL;
    632		/* Set default bdev, but target must bio_set_dev() before issuing IO */
    633		clone->bi_bdev = md->disk->part0;
    634
    635		/* REQ_DM_POLL_LIST shouldn't be inherited */
    636		clone->bi_opf &= ~REQ_DM_POLL_LIST;
    637
    638		tio = clone_to_tio(clone);
    639		tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
    640	}
    641
    642	tio->magic = DM_TIO_MAGIC;
    643	tio->io = ci->io;
    644	tio->ti = ti;
    645	tio->target_bio_nr = target_bio_nr;
    646	tio->len_ptr = len;
    647	tio->old_sector = 0;
    648
    649	if (len) {
    650		clone->bi_iter.bi_size = to_bytes(*len);
    651		if (bio_integrity(clone))
    652			bio_integrity_trim(clone);
    653	}
    654
    655	return clone;
    656}
    657
    658static void free_tio(struct bio *clone)
    659{
    660	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
    661		return;
    662	bio_put(clone);
    663}
    664
    665/*
    666 * Add the bio to the list of deferred io.
    667 */
    668static void queue_io(struct mapped_device *md, struct bio *bio)
    669{
    670	unsigned long flags;
    671
    672	spin_lock_irqsave(&md->deferred_lock, flags);
    673	bio_list_add(&md->deferred, bio);
    674	spin_unlock_irqrestore(&md->deferred_lock, flags);
    675	queue_work(md->wq, &md->work);
    676}
    677
    678/*
     679 * Everyone (including functions in this file) should use this
    680 * function to access the md->map field, and make sure they call
    681 * dm_put_live_table() when finished.
    682 */
    683struct dm_table *dm_get_live_table(struct mapped_device *md,
    684				   int *srcu_idx) __acquires(md->io_barrier)
    685{
    686	*srcu_idx = srcu_read_lock(&md->io_barrier);
    687
    688	return srcu_dereference(md->map, &md->io_barrier);
    689}
    690
    691void dm_put_live_table(struct mapped_device *md,
    692		       int srcu_idx) __releases(md->io_barrier)
    693{
    694	srcu_read_unlock(&md->io_barrier, srcu_idx);
    695}
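
/*
 * Typical usage pattern (editor's illustration, mirroring dm_prepare_ioctl()
 * and other callers in this file):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		// inspect the table or its targets; sleeping is allowed
 *		// here because this is SRCU, not plain RCU
 *	}
 *	dm_put_live_table(md, srcu_idx);
 */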
    696
    697void dm_sync_table(struct mapped_device *md)
    698{
    699	synchronize_srcu(&md->io_barrier);
    700	synchronize_rcu_expedited();
    701}
    702
    703/*
    704 * A fast alternative to dm_get_live_table/dm_put_live_table.
    705 * The caller must not block between these two functions.
    706 */
    707static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
    708{
    709	rcu_read_lock();
    710	return rcu_dereference(md->map);
    711}
    712
    713static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
    714{
    715	rcu_read_unlock();
    716}
    717
    718static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
    719						     int *srcu_idx, unsigned bio_opf)
    720{
    721	if (bio_opf & REQ_NOWAIT)
    722		return dm_get_live_table_fast(md);
    723	else
    724		return dm_get_live_table(md, srcu_idx);
    725}
    726
    727static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
    728					 unsigned bio_opf)
    729{
    730	if (bio_opf & REQ_NOWAIT)
    731		dm_put_live_table_fast(md);
    732	else
    733		dm_put_live_table(md, srcu_idx);
    734}
    735
    736static char *_dm_claim_ptr = "I belong to device-mapper";
    737
    738/*
    739 * Open a table device so we can use it as a map destination.
    740 */
    741static int open_table_device(struct table_device *td, dev_t dev,
    742			     struct mapped_device *md)
    743{
    744	struct block_device *bdev;
    745	u64 part_off;
    746	int r;
    747
    748	BUG_ON(td->dm_dev.bdev);
    749
    750	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
    751	if (IS_ERR(bdev))
    752		return PTR_ERR(bdev);
    753
    754	r = bd_link_disk_holder(bdev, dm_disk(md));
    755	if (r) {
    756		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
    757		return r;
    758	}
    759
    760	td->dm_dev.bdev = bdev;
    761	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
    762	return 0;
    763}
    764
    765/*
    766 * Close a table device that we've been using.
    767 */
    768static void close_table_device(struct table_device *td, struct mapped_device *md)
    769{
    770	if (!td->dm_dev.bdev)
    771		return;
    772
    773	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
    774	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
    775	put_dax(td->dm_dev.dax_dev);
    776	td->dm_dev.bdev = NULL;
    777	td->dm_dev.dax_dev = NULL;
    778}
    779
    780static struct table_device *find_table_device(struct list_head *l, dev_t dev,
    781					      fmode_t mode)
    782{
    783	struct table_device *td;
    784
    785	list_for_each_entry(td, l, list)
    786		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
    787			return td;
    788
    789	return NULL;
    790}
    791
    792int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
    793			struct dm_dev **result)
    794{
    795	int r;
    796	struct table_device *td;
    797
    798	mutex_lock(&md->table_devices_lock);
    799	td = find_table_device(&md->table_devices, dev, mode);
    800	if (!td) {
    801		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
    802		if (!td) {
    803			mutex_unlock(&md->table_devices_lock);
    804			return -ENOMEM;
    805		}
    806
    807		td->dm_dev.mode = mode;
    808		td->dm_dev.bdev = NULL;
    809
    810		if ((r = open_table_device(td, dev, md))) {
    811			mutex_unlock(&md->table_devices_lock);
    812			kfree(td);
    813			return r;
    814		}
    815
    816		format_dev_t(td->dm_dev.name, dev);
    817
    818		refcount_set(&td->count, 1);
    819		list_add(&td->list, &md->table_devices);
    820	} else {
    821		refcount_inc(&td->count);
    822	}
    823	mutex_unlock(&md->table_devices_lock);
    824
    825	*result = &td->dm_dev;
    826	return 0;
    827}
    828
    829void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
    830{
    831	struct table_device *td = container_of(d, struct table_device, dm_dev);
    832
    833	mutex_lock(&md->table_devices_lock);
    834	if (refcount_dec_and_test(&td->count)) {
    835		close_table_device(td, md);
    836		list_del(&td->list);
    837		kfree(td);
    838	}
    839	mutex_unlock(&md->table_devices_lock);
    840}
    841
    842static void free_table_devices(struct list_head *devices)
    843{
    844	struct list_head *tmp, *next;
    845
    846	list_for_each_safe(tmp, next, devices) {
    847		struct table_device *td = list_entry(tmp, struct table_device, list);
    848
    849		DMWARN("dm_destroy: %s still exists with %d references",
    850		       td->dm_dev.name, refcount_read(&td->count));
    851		kfree(td);
    852	}
    853}
    854
    855/*
    856 * Get the geometry associated with a dm device
    857 */
    858int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
    859{
    860	*geo = md->geometry;
    861
    862	return 0;
    863}
    864
    865/*
    866 * Set the geometry of a device.
    867 */
    868int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
    869{
    870	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
    871
    872	if (geo->start > sz) {
    873		DMWARN("Start sector is beyond the geometry limits.");
    874		return -EINVAL;
    875	}
    876
    877	md->geometry = *geo;
    878
    879	return 0;
    880}
    881
    882static int __noflush_suspending(struct mapped_device *md)
    883{
    884	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
    885}
    886
    887static void dm_io_complete(struct dm_io *io)
    888{
    889	blk_status_t io_error;
    890	struct mapped_device *md = io->md;
    891	struct bio *bio = io->split_bio ? io->split_bio : io->orig_bio;
    892
    893	if (io->status == BLK_STS_DM_REQUEUE) {
    894		unsigned long flags;
    895		/*
    896		 * Target requested pushing back the I/O.
    897		 */
    898		spin_lock_irqsave(&md->deferred_lock, flags);
    899		if (__noflush_suspending(md) &&
    900		    !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
    901			/* NOTE early return due to BLK_STS_DM_REQUEUE below */
    902			bio_list_add_head(&md->deferred, bio);
    903		} else {
    904			/*
    905			 * noflush suspend was interrupted or this is
    906			 * a write to a zoned target.
    907			 */
    908			io->status = BLK_STS_IOERR;
    909		}
    910		spin_unlock_irqrestore(&md->deferred_lock, flags);
    911	}
    912
    913	io_error = io->status;
    914	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
    915		dm_end_io_acct(io);
    916	else if (!io_error) {
    917		/*
    918		 * Must handle target that DM_MAPIO_SUBMITTED only to
    919		 * then bio_endio() rather than dm_submit_bio_remap()
    920		 */
    921		__dm_start_io_acct(io);
    922		dm_end_io_acct(io);
    923	}
    924	free_io(io);
    925	smp_wmb();
    926	this_cpu_dec(*md->pending_io);
    927
    928	/* nudge anyone waiting on suspend queue */
    929	if (unlikely(wq_has_sleeper(&md->wait)))
    930		wake_up(&md->wait);
    931
    932	if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) {
    933		if (bio->bi_opf & REQ_POLLED) {
    934			/*
    935			 * Upper layer won't help us poll split bio (io->orig_bio
    936			 * may only reflect a subset of the pre-split original)
    937			 * so clear REQ_POLLED in case of requeue.
    938			 */
    939			bio_clear_polled(bio);
    940			if (io_error == BLK_STS_AGAIN) {
    941				/* io_uring doesn't handle BLK_STS_AGAIN (yet) */
    942				queue_io(md, bio);
    943				return;
    944			}
    945		}
    946		if (io_error == BLK_STS_DM_REQUEUE)
    947			return;
    948	}
    949
    950	if (bio_is_flush_with_data(bio)) {
    951		/*
    952		 * Preflush done for flush with data, reissue
    953		 * without REQ_PREFLUSH.
    954		 */
    955		bio->bi_opf &= ~REQ_PREFLUSH;
    956		queue_io(md, bio);
    957	} else {
    958		/* done with normal IO or empty flush */
    959		if (io_error)
    960			bio->bi_status = io_error;
    961		bio_endio(bio);
    962	}
    963}
    964
    965/*
    966 * Decrements the number of outstanding ios that a bio has been
     967 * cloned into, completing the original io if necessary.
    968 */
    969static inline void __dm_io_dec_pending(struct dm_io *io)
    970{
    971	if (atomic_dec_and_test(&io->io_count))
    972		dm_io_complete(io);
    973}
    974
    975static void dm_io_set_error(struct dm_io *io, blk_status_t error)
    976{
    977	unsigned long flags;
    978
    979	/* Push-back supersedes any I/O errors */
    980	spin_lock_irqsave(&io->lock, flags);
    981	if (!(io->status == BLK_STS_DM_REQUEUE &&
    982	      __noflush_suspending(io->md))) {
    983		io->status = error;
    984	}
    985	spin_unlock_irqrestore(&io->lock, flags);
    986}
    987
    988static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
    989{
    990	if (unlikely(error))
    991		dm_io_set_error(io, error);
    992
    993	__dm_io_dec_pending(io);
    994}
    995
    996void disable_discard(struct mapped_device *md)
    997{
    998	struct queue_limits *limits = dm_get_queue_limits(md);
    999
   1000	/* device doesn't really support DISCARD, disable it */
   1001	limits->max_discard_sectors = 0;
   1002}
   1003
   1004void disable_write_zeroes(struct mapped_device *md)
   1005{
   1006	struct queue_limits *limits = dm_get_queue_limits(md);
   1007
   1008	/* device doesn't really support WRITE ZEROES, disable it */
   1009	limits->max_write_zeroes_sectors = 0;
   1010}
   1011
   1012static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
   1013{
   1014	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
   1015}
   1016
   1017static void clone_endio(struct bio *bio)
   1018{
   1019	blk_status_t error = bio->bi_status;
   1020	struct dm_target_io *tio = clone_to_tio(bio);
   1021	struct dm_target *ti = tio->ti;
   1022	dm_endio_fn endio = ti->type->end_io;
   1023	struct dm_io *io = tio->io;
   1024	struct mapped_device *md = io->md;
   1025
   1026	if (unlikely(error == BLK_STS_TARGET)) {
   1027		if (bio_op(bio) == REQ_OP_DISCARD &&
   1028		    !bdev_max_discard_sectors(bio->bi_bdev))
   1029			disable_discard(md);
   1030		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
   1031			 !bdev_write_zeroes_sectors(bio->bi_bdev))
   1032			disable_write_zeroes(md);
   1033	}
   1034
   1035	if (static_branch_unlikely(&zoned_enabled) &&
   1036	    unlikely(blk_queue_is_zoned(bdev_get_queue(bio->bi_bdev))))
   1037		dm_zone_endio(io, bio);
   1038
   1039	if (endio) {
   1040		int r = endio(ti, bio, &error);
   1041		switch (r) {
   1042		case DM_ENDIO_REQUEUE:
   1043			if (static_branch_unlikely(&zoned_enabled)) {
   1044				/*
   1045				 * Requeuing writes to a sequential zone of a zoned
   1046				 * target will break the sequential write pattern:
   1047				 * fail such IO.
   1048				 */
   1049				if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
   1050					error = BLK_STS_IOERR;
   1051				else
   1052					error = BLK_STS_DM_REQUEUE;
   1053			} else
   1054				error = BLK_STS_DM_REQUEUE;
   1055			fallthrough;
   1056		case DM_ENDIO_DONE:
   1057			break;
   1058		case DM_ENDIO_INCOMPLETE:
   1059			/* The target will handle the io */
   1060			return;
   1061		default:
   1062			DMWARN("unimplemented target endio return value: %d", r);
   1063			BUG();
   1064		}
   1065	}
   1066
   1067	if (static_branch_unlikely(&swap_bios_enabled) &&
   1068	    unlikely(swap_bios_limit(ti, bio)))
   1069		up(&md->swap_bios_semaphore);
   1070
   1071	free_tio(bio);
   1072	dm_io_dec_pending(io, error);
   1073}
   1074
   1075/*
   1076 * Return maximum size of I/O possible at the supplied sector up to the current
   1077 * target boundary.
   1078 */
   1079static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
   1080						  sector_t target_offset)
   1081{
   1082	return ti->len - target_offset;
   1083}
   1084
   1085static sector_t max_io_len(struct dm_target *ti, sector_t sector)
   1086{
   1087	sector_t target_offset = dm_target_offset(ti, sector);
   1088	sector_t len = max_io_len_target_boundary(ti, target_offset);
   1089	sector_t max_len;
   1090
   1091	/*
   1092	 * Does the target need to split IO even further?
   1093	 * - varied (per target) IO splitting is a tenet of DM; this
   1094	 *   explains why stacked chunk_sectors based splitting via
   1095	 *   blk_max_size_offset() isn't possible here. So pass in
   1096	 *   ti->max_io_len to override stacked chunk_sectors.
   1097	 */
   1098	if (ti->max_io_len) {
   1099		max_len = blk_max_size_offset(ti->table->md->queue,
   1100					      target_offset, ti->max_io_len);
   1101		if (len > max_len)
   1102			len = max_len;
   1103	}
   1104
   1105	return len;
   1106}
   1107
   1108int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
   1109{
   1110	if (len > UINT_MAX) {
   1111		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
   1112		      (unsigned long long)len, UINT_MAX);
   1113		ti->error = "Maximum size of target IO is too large";
   1114		return -EINVAL;
   1115	}
   1116
   1117	ti->max_io_len = (uint32_t) len;
   1118
   1119	return 0;
   1120}
   1121EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
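
/*
 * Illustrative sketch (editor's example): a chunked target typically calls
 * this from its constructor so that DM core splits normal (read/write) IO
 * at ti->max_io_len boundaries (dm-stripe does the equivalent with its
 * chunk size):
 *
 *	// in a hypothetical example_ctr(), after parsing chunk_sectors
 *	r = dm_set_target_max_io_len(ti, chunk_sectors);
 *	if (r)
 *		return r;       // ti->error was already set above
 */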
   1122
   1123static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
   1124						sector_t sector, int *srcu_idx)
   1125	__acquires(md->io_barrier)
   1126{
   1127	struct dm_table *map;
   1128	struct dm_target *ti;
   1129
   1130	map = dm_get_live_table(md, srcu_idx);
   1131	if (!map)
   1132		return NULL;
   1133
   1134	ti = dm_table_find_target(map, sector);
   1135	if (!ti)
   1136		return NULL;
   1137
   1138	return ti;
   1139}
   1140
   1141static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
   1142		long nr_pages, enum dax_access_mode mode, void **kaddr,
   1143		pfn_t *pfn)
   1144{
   1145	struct mapped_device *md = dax_get_private(dax_dev);
   1146	sector_t sector = pgoff * PAGE_SECTORS;
   1147	struct dm_target *ti;
   1148	long len, ret = -EIO;
   1149	int srcu_idx;
   1150
   1151	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
   1152
   1153	if (!ti)
   1154		goto out;
   1155	if (!ti->type->direct_access)
   1156		goto out;
   1157	len = max_io_len(ti, sector) / PAGE_SECTORS;
   1158	if (len < 1)
   1159		goto out;
   1160	nr_pages = min(len, nr_pages);
   1161	ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
   1162
   1163 out:
   1164	dm_put_live_table(md, srcu_idx);
   1165
   1166	return ret;
   1167}
   1168
   1169static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
   1170				  size_t nr_pages)
   1171{
   1172	struct mapped_device *md = dax_get_private(dax_dev);
   1173	sector_t sector = pgoff * PAGE_SECTORS;
   1174	struct dm_target *ti;
   1175	int ret = -EIO;
   1176	int srcu_idx;
   1177
   1178	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
   1179
   1180	if (!ti)
   1181		goto out;
   1182	if (WARN_ON(!ti->type->dax_zero_page_range)) {
   1183		/*
    1184		 * ->zero_page_range() is a mandatory dax operation. If we are
   1185		 *  here, something is wrong.
   1186		 */
   1187		goto out;
   1188	}
   1189	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
   1190 out:
   1191	dm_put_live_table(md, srcu_idx);
   1192
   1193	return ret;
   1194}
   1195
   1196static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
   1197		void *addr, size_t bytes, struct iov_iter *i)
   1198{
   1199	struct mapped_device *md = dax_get_private(dax_dev);
   1200	sector_t sector = pgoff * PAGE_SECTORS;
   1201	struct dm_target *ti;
   1202	int srcu_idx;
   1203	long ret = 0;
   1204
   1205	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
   1206	if (!ti || !ti->type->dax_recovery_write)
   1207		goto out;
   1208
   1209	ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
   1210out:
   1211	dm_put_live_table(md, srcu_idx);
   1212	return ret;
   1213}
   1214
   1215/*
   1216 * A target may call dm_accept_partial_bio only from the map routine.  It is
   1217 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
   1218 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
   1219 * __send_duplicate_bios().
   1220 *
   1221 * dm_accept_partial_bio informs the dm that the target only wants to process
   1222 * additional n_sectors sectors of the bio and the rest of the data should be
   1223 * sent in a next bio.
   1224 *
    1225 * A diagram that explains the arithmetic:
   1226 * +--------------------+---------------+-------+
   1227 * |         1          |       2       |   3   |
   1228 * +--------------------+---------------+-------+
   1229 *
   1230 * <-------------- *tio->len_ptr --------------->
   1231 *                      <----- bio_sectors ----->
   1232 *                      <-- n_sectors -->
   1233 *
   1234 * Region 1 was already iterated over with bio_advance or similar function.
   1235 *	(it may be empty if the target doesn't use bio_advance)
   1236 * Region 2 is the remaining bio size that the target wants to process.
   1237 *	(it may be empty if region 1 is non-empty, although there is no reason
   1238 *	 to make it empty)
   1239 * The target requires that region 3 is to be sent in the next bio.
   1240 *
   1241 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
   1242 * the partially processed part (the sum of regions 1+2) must be the same for all
   1243 * copies of the bio.
   1244 */
   1245void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
   1246{
   1247	struct dm_target_io *tio = clone_to_tio(bio);
   1248	unsigned bio_sectors = bio_sectors(bio);
   1249
   1250	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
   1251	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
   1252	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
   1253	BUG_ON(bio_sectors > *tio->len_ptr);
   1254	BUG_ON(n_sectors > bio_sectors);
   1255
   1256	*tio->len_ptr -= bio_sectors - n_sectors;
   1257	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
   1258
   1259	/*
   1260	 * __split_and_process_bio() may have already saved mapped part
   1261	 * for accounting but it is being reduced so update accordingly.
   1262	 */
   1263	dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT);
   1264	tio->io->sectors = n_sectors;
   1265}
   1266EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
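
/*
 * Illustrative sketch (editor's example, hypothetical target code): a .map
 * implementation that can only service the first part of a bio in this
 * pass trims it and lets DM core resubmit the remainder (region 3 in the
 * diagram above) as a follow-up bio:
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct example_ctx *ec = ti->private;   // hypothetical
 *
 *		if (bio_sectors(bio) > EXAMPLE_MAX_SECTORS) // hypothetical limit
 *			dm_accept_partial_bio(bio, EXAMPLE_MAX_SECTORS);
 *
 *		bio_set_dev(bio, ec->dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 */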
   1267
   1268/*
   1269 * @clone: clone bio that DM core passed to target's .map function
   1270 * @tgt_clone: clone of @clone bio that target needs submitted
   1271 *
   1272 * Targets should use this interface to submit bios they take
   1273 * ownership of when returning DM_MAPIO_SUBMITTED.
   1274 *
   1275 * Target should also enable ti->accounts_remapped_io
   1276 */
   1277void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
   1278{
   1279	struct dm_target_io *tio = clone_to_tio(clone);
   1280	struct dm_io *io = tio->io;
   1281
   1282	/* establish bio that will get submitted */
   1283	if (!tgt_clone)
   1284		tgt_clone = clone;
   1285
   1286	/*
   1287	 * Account io->origin_bio to DM dev on behalf of target
   1288	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
   1289	 */
   1290	dm_start_io_acct(io, clone);
   1291
   1292	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
   1293			      tio->old_sector);
   1294	submit_bio_noacct(tgt_clone);
   1295}
   1296EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
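
/*
 * Illustrative sketch (editor's example, hypothetical target code): a
 * target that defers submission to a worker sets
 * ti->accounts_remapped_io = true in its constructor, returns
 * DM_MAPIO_SUBMITTED from .map after queueing the clone, and later hands
 * the clone back to DM core from the worker:
 *
 *	static void example_worker(struct work_struct *work)
 *	{
 *		struct example_io *eio =
 *			container_of(work, struct example_io, work);
 *
 *		// per-bio preparation (throttling, encryption, ...) goes here;
 *		// eio->clone is the bio DM core originally passed to .map
 *		dm_submit_bio_remap(eio->clone, NULL);
 *	}
 *
 * dm-delay and dm-crypt follow this pattern with their own workers.
 */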
   1297
   1298static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
   1299{
   1300	mutex_lock(&md->swap_bios_lock);
   1301	while (latch < md->swap_bios) {
   1302		cond_resched();
   1303		down(&md->swap_bios_semaphore);
   1304		md->swap_bios--;
   1305	}
   1306	while (latch > md->swap_bios) {
   1307		cond_resched();
   1308		up(&md->swap_bios_semaphore);
   1309		md->swap_bios++;
   1310	}
   1311	mutex_unlock(&md->swap_bios_lock);
   1312}
   1313
   1314static void __map_bio(struct bio *clone)
   1315{
   1316	struct dm_target_io *tio = clone_to_tio(clone);
   1317	struct dm_target *ti = tio->ti;
   1318	struct dm_io *io = tio->io;
   1319	struct mapped_device *md = io->md;
   1320	int r;
   1321
   1322	clone->bi_end_io = clone_endio;
   1323
   1324	/*
   1325	 * Map the clone.
   1326	 */
   1327	tio->old_sector = clone->bi_iter.bi_sector;
   1328
   1329	if (static_branch_unlikely(&swap_bios_enabled) &&
   1330	    unlikely(swap_bios_limit(ti, clone))) {
   1331		int latch = get_swap_bios();
   1332		if (unlikely(latch != md->swap_bios))
   1333			__set_swap_bios_limit(md, latch);
   1334		down(&md->swap_bios_semaphore);
   1335	}
   1336
   1337	if (static_branch_unlikely(&zoned_enabled)) {
   1338		/*
   1339		 * Check if the IO needs a special mapping due to zone append
   1340		 * emulation on zoned target. In this case, dm_zone_map_bio()
   1341		 * calls the target map operation.
   1342		 */
   1343		if (unlikely(dm_emulate_zone_append(md)))
   1344			r = dm_zone_map_bio(tio);
   1345		else
   1346			r = ti->type->map(ti, clone);
   1347	} else
   1348		r = ti->type->map(ti, clone);
   1349
   1350	switch (r) {
   1351	case DM_MAPIO_SUBMITTED:
   1352		/* target has assumed ownership of this io */
   1353		if (!ti->accounts_remapped_io)
   1354			dm_start_io_acct(io, clone);
   1355		break;
   1356	case DM_MAPIO_REMAPPED:
   1357		dm_submit_bio_remap(clone, NULL);
   1358		break;
   1359	case DM_MAPIO_KILL:
   1360	case DM_MAPIO_REQUEUE:
   1361		if (static_branch_unlikely(&swap_bios_enabled) &&
   1362		    unlikely(swap_bios_limit(ti, clone)))
   1363			up(&md->swap_bios_semaphore);
   1364		free_tio(clone);
   1365		if (r == DM_MAPIO_KILL)
   1366			dm_io_dec_pending(io, BLK_STS_IOERR);
   1367		else
   1368			dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
   1369		break;
   1370	default:
   1371		DMWARN("unimplemented target map return value: %d", r);
   1372		BUG();
   1373	}
   1374}
   1375
   1376static void setup_split_accounting(struct clone_info *ci, unsigned len)
   1377{
   1378	struct dm_io *io = ci->io;
   1379
   1380	if (ci->sector_count > len) {
   1381		/*
   1382		 * Split needed, save the mapped part for accounting.
   1383		 * NOTE: dm_accept_partial_bio() will update accordingly.
   1384		 */
   1385		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
   1386		io->sectors = len;
   1387	}
   1388
   1389	if (static_branch_unlikely(&stats_enabled) &&
   1390	    unlikely(dm_stats_used(&io->md->stats))) {
   1391		/*
   1392		 * Save bi_sector in terms of its offset from end of
   1393		 * original bio, only needed for DM-stats' benefit.
   1394		 * - saved regardless of whether split needed so that
   1395		 *   dm_accept_partial_bio() doesn't need to.
   1396		 */
   1397		io->sector_offset = bio_end_sector(ci->bio) - ci->sector;
   1398	}
   1399}
   1400
   1401static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
   1402				struct dm_target *ti, unsigned num_bios)
   1403{
   1404	struct bio *bio;
   1405	int try;
   1406
   1407	for (try = 0; try < 2; try++) {
   1408		int bio_nr;
   1409
   1410		if (try)
   1411			mutex_lock(&ci->io->md->table_devices_lock);
   1412		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
   1413			bio = alloc_tio(ci, ti, bio_nr, NULL,
   1414					try ? GFP_NOIO : GFP_NOWAIT);
   1415			if (!bio)
   1416				break;
   1417
   1418			bio_list_add(blist, bio);
   1419		}
   1420		if (try)
   1421			mutex_unlock(&ci->io->md->table_devices_lock);
   1422		if (bio_nr == num_bios)
   1423			return;
   1424
   1425		while ((bio = bio_list_pop(blist)))
   1426			free_tio(bio);
   1427	}
   1428}
   1429
   1430static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
   1431				  unsigned num_bios, unsigned *len)
   1432{
   1433	struct bio_list blist = BIO_EMPTY_LIST;
   1434	struct bio *clone;
   1435	int ret = 0;
   1436
   1437	switch (num_bios) {
   1438	case 0:
   1439		break;
   1440	case 1:
   1441		if (len)
   1442			setup_split_accounting(ci, *len);
   1443		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
   1444		__map_bio(clone);
   1445		ret = 1;
   1446		break;
   1447	default:
   1448		/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
   1449		alloc_multiple_bios(&blist, ci, ti, num_bios);
   1450		while ((clone = bio_list_pop(&blist))) {
   1451			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
   1452			__map_bio(clone);
   1453			ret += 1;
   1454		}
   1455		break;
   1456	}
   1457
   1458	return ret;
   1459}
   1460
   1461static void __send_empty_flush(struct clone_info *ci)
   1462{
   1463	unsigned target_nr = 0;
   1464	struct dm_target *ti;
   1465	struct bio flush_bio;
   1466
   1467	/*
   1468	 * Use an on-stack bio for this, it's safe since we don't
   1469	 * need to reference it after submit. It's just used as
   1470	 * the basis for the clone(s).
   1471	 */
   1472	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
   1473		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
   1474
   1475	ci->bio = &flush_bio;
   1476	ci->sector_count = 0;
   1477	ci->io->tio.clone.bi_iter.bi_size = 0;
   1478
   1479	while ((ti = dm_table_get_target(ci->map, target_nr++))) {
   1480		int bios;
   1481
   1482		atomic_add(ti->num_flush_bios, &ci->io->io_count);
   1483		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
   1484		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
   1485	}
   1486
   1487	/*
   1488	 * alloc_io() takes one extra reference for submission, so the
   1489	 * reference won't reach 0 without the following subtraction
   1490	 */
   1491	atomic_sub(1, &ci->io->io_count);
   1492
   1493	bio_uninit(ci->bio);
   1494}
   1495
   1496static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
   1497					unsigned num_bios)
   1498{
   1499	unsigned len;
   1500	int bios;
   1501
   1502	len = min_t(sector_t, ci->sector_count,
   1503		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
   1504
   1505	atomic_add(num_bios, &ci->io->io_count);
   1506	bios = __send_duplicate_bios(ci, ti, num_bios, &len);
   1507	/*
   1508	 * alloc_io() takes one extra reference for submission, so the
   1509	 * reference won't reach 0 without the following (+1) subtraction
   1510	 */
   1511	atomic_sub(num_bios - bios + 1, &ci->io->io_count);
   1512
   1513	ci->sector += len;
   1514	ci->sector_count -= len;
   1515}
   1516
   1517static bool is_abnormal_io(struct bio *bio)
   1518{
   1519	unsigned int op = bio_op(bio);
   1520
   1521	if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
   1522		switch (op) {
   1523		case REQ_OP_DISCARD:
   1524		case REQ_OP_SECURE_ERASE:
   1525		case REQ_OP_WRITE_ZEROES:
   1526			return true;
   1527		default:
   1528			break;
   1529		}
   1530	}
   1531
   1532	return false;
   1533}
   1534
   1535static blk_status_t __process_abnormal_io(struct clone_info *ci,
   1536					  struct dm_target *ti)
   1537{
   1538	unsigned num_bios = 0;
   1539
   1540	switch (bio_op(ci->bio)) {
   1541	case REQ_OP_DISCARD:
   1542		num_bios = ti->num_discard_bios;
   1543		break;
   1544	case REQ_OP_SECURE_ERASE:
   1545		num_bios = ti->num_secure_erase_bios;
   1546		break;
   1547	case REQ_OP_WRITE_ZEROES:
   1548		num_bios = ti->num_write_zeroes_bios;
   1549		break;
   1550	}
   1551
   1552	/*
   1553	 * Even though the device advertised support for this type of
   1554	 * request, that does not mean every target supports it, and
   1555	 * reconfiguration might also have changed that since the
   1556	 * check was performed.
   1557	 */
   1558	if (unlikely(!num_bios))
   1559		return BLK_STS_NOTSUPP;
   1560
   1561	__send_changing_extent_only(ci, ti, num_bios);
   1562	return BLK_STS_OK;
   1563}
   1564
   1565/*
   1566 * Reuse ->bi_private as dm_io list head for storing all dm_io instances
   1567 * associated with this bio, and this bio's bi_private needs to be
   1568 * stored in dm_io->data before the reuse.
   1569 *
   1570 * bio->bi_private is owned by fs or upper layer, so block layer won't
   1571 * touch it after splitting. Meantime it won't be changed by anyone after
   1572 * bio is submitted. So this reuse is safe.
   1573 */
   1574static inline struct dm_io **dm_poll_list_head(struct bio *bio)
   1575{
   1576	return (struct dm_io **)&bio->bi_private;
   1577}
   1578
   1579static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
   1580{
   1581	struct dm_io **head = dm_poll_list_head(bio);
   1582
   1583	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
   1584		bio->bi_opf |= REQ_DM_POLL_LIST;
   1585		/*
   1586		 * Save .bi_private into dm_io, so that we can reuse
   1587		 * .bi_private as dm_io list head for storing dm_io list
   1588		 */
   1589		io->data = bio->bi_private;
   1590
   1591		/* tell block layer to poll for completion */
   1592		bio->bi_cookie = ~BLK_QC_T_NONE;
   1593
   1594		io->next = NULL;
   1595	} else {
   1596		/*
   1597		 * bio recursed due to split, reuse original poll list,
   1598		 * and save bio->bi_private too.
   1599		 */
   1600		io->data = (*head)->data;
   1601		io->next = *head;
   1602	}
   1603
   1604	*head = io;
   1605}
   1606
   1607/*
   1608 * Select the correct strategy for processing a non-flush bio.
   1609 */
   1610static blk_status_t __split_and_process_bio(struct clone_info *ci)
   1611{
   1612	struct bio *clone;
   1613	struct dm_target *ti;
   1614	unsigned len;
   1615
   1616	ti = dm_table_find_target(ci->map, ci->sector);
   1617	if (unlikely(!ti))
   1618		return BLK_STS_IOERR;
   1619
   1620	if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
   1621	    unlikely(!dm_target_supports_nowait(ti->type)))
   1622		return BLK_STS_NOTSUPP;
   1623
   1624	if (unlikely(ci->is_abnormal_io))
   1625		return __process_abnormal_io(ci, ti);
   1626
   1627	/*
   1628	 * Only support bio polling for normal IO, and the target io is
   1629	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
   1630	 */
   1631	ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
   1632
   1633	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
   1634	setup_split_accounting(ci, len);
   1635	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
   1636	__map_bio(clone);
   1637
   1638	ci->sector += len;
   1639	ci->sector_count -= len;
   1640
   1641	return BLK_STS_OK;
   1642}
   1643
   1644static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
   1645			    struct dm_table *map, struct bio *bio, bool is_abnormal)
   1646{
   1647	ci->map = map;
   1648	ci->io = alloc_io(md, bio);
   1649	ci->bio = bio;
   1650	ci->is_abnormal_io = is_abnormal;
   1651	ci->submit_as_polled = false;
   1652	ci->sector = bio->bi_iter.bi_sector;
   1653	ci->sector_count = bio_sectors(bio);
   1654
   1655	/* Shouldn't happen but sector_count was being set to 0 so... */
   1656	if (static_branch_unlikely(&zoned_enabled) &&
   1657	    WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
   1658		ci->sector_count = 0;
   1659}
   1660
   1661/*
   1662 * Entry point to split a bio into clones and submit them to the targets.
   1663 */
   1664static void dm_split_and_process_bio(struct mapped_device *md,
   1665				     struct dm_table *map, struct bio *bio)
   1666{
   1667	struct clone_info ci;
   1668	struct dm_io *io;
   1669	blk_status_t error = BLK_STS_OK;
   1670	bool is_abnormal;
   1671
   1672	is_abnormal = is_abnormal_io(bio);
   1673	if (unlikely(is_abnormal)) {
   1674		/*
   1675		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
   1676		 * otherwise associated queue_limits won't be imposed.
   1677		 */
   1678		blk_queue_split(&bio);
   1679	}
   1680
   1681	init_clone_info(&ci, md, map, bio, is_abnormal);
   1682	io = ci.io;
   1683
   1684	if (bio->bi_opf & REQ_PREFLUSH) {
   1685		__send_empty_flush(&ci);
   1686		/* dm_io_complete submits any data associated with flush */
   1687		goto out;
   1688	}
   1689
   1690	error = __split_and_process_bio(&ci);
   1691	if (error || !ci.sector_count)
   1692		goto out;
   1693	/*
   1694	 * Remainder must be passed to submit_bio_noacct() so it gets handled
   1695	 * *after* bios already submitted have been completely processed.
   1696	 */
   1697	WARN_ON_ONCE(!dm_io_flagged(io, DM_IO_WAS_SPLIT));
   1698	io->split_bio = bio_split(bio, io->sectors, GFP_NOIO,
   1699				  &md->queue->bio_split);
   1700	bio_chain(io->split_bio, bio);
   1701	trace_block_split(io->split_bio, bio->bi_iter.bi_sector);
   1702	submit_bio_noacct(bio);
   1703out:
   1704	/*
   1705	 * Drop the extra reference count for non-POLLED bio, and hold one
   1706	 * reference for POLLED bio, which will be released in dm_poll_bio
   1707	 *
   1708	 * Add every dm_io instance into the dm_io list head which is stored
   1709	 * in bio->bi_private, so that dm_poll_bio can poll them all.
   1710	 */
   1711	if (error || !ci.submit_as_polled) {
   1712		/*
   1713		 * In case of submission failure, the extra reference for
   1714		 * submitting io isn't consumed yet
   1715		 */
   1716		if (error)
   1717			atomic_dec(&io->io_count);
   1718		dm_io_dec_pending(io, error);
   1719	} else
   1720		dm_queue_poll_io(bio, io);
   1721}
   1722
   1723static void dm_submit_bio(struct bio *bio)
   1724{
   1725	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
   1726	int srcu_idx;
   1727	struct dm_table *map;
   1728	unsigned bio_opf = bio->bi_opf;
   1729
   1730	map = dm_get_live_table_bio(md, &srcu_idx, bio_opf);
   1731
   1732	/* If suspended, or map not yet available, queue this IO for later */
   1733	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
   1734	    unlikely(!map)) {
   1735		if (bio->bi_opf & REQ_NOWAIT)
   1736			bio_wouldblock_error(bio);
   1737		else if (bio->bi_opf & REQ_RAHEAD)
   1738			bio_io_error(bio);
   1739		else
   1740			queue_io(md, bio);
   1741		goto out;
   1742	}
   1743
   1744	dm_split_and_process_bio(md, map, bio);
   1745out:
   1746	dm_put_live_table_bio(md, srcu_idx, bio_opf);
   1747}
   1748
   1749static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
   1750			  unsigned int flags)
   1751{
   1752	WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
   1753
   1754	/* don't poll if the mapped io is done */
   1755	if (atomic_read(&io->io_count) > 1)
   1756		bio_poll(&io->tio.clone, iob, flags);
   1757
   1758	/* bio_poll holds the last reference */
   1759	return atomic_read(&io->io_count) == 1;
   1760}
   1761
   1762static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
   1763		       unsigned int flags)
   1764{
   1765	struct dm_io **head = dm_poll_list_head(bio);
   1766	struct dm_io *list = *head;
   1767	struct dm_io *tmp = NULL;
   1768	struct dm_io *curr, *next;
   1769
   1770	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
   1771	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
   1772		return 0;
   1773
   1774	WARN_ON_ONCE(!list);
   1775
   1776	/*
   1777	 * Restore .bi_private before possibly completing dm_io.
   1778	 *
   1779	 * bio_poll() is only possible once @bio has been completely
   1780	 * submitted via submit_bio_noacct()'s depth-first submission.
   1781	 * So there is no dm_queue_poll_io() race associated with
   1782	 * clearing REQ_DM_POLL_LIST here.
   1783	 */
   1784	bio->bi_opf &= ~REQ_DM_POLL_LIST;
   1785	bio->bi_private = list->data;
   1786
   1787	for (curr = list, next = curr->next; curr; curr = next, next =
   1788			curr ? curr->next : NULL) {
   1789		if (dm_poll_dm_io(curr, iob, flags)) {
   1790			/*
   1791			 * clone_endio() has already occurred, so no
   1792			 * error handling is needed here.
   1793			 */
   1794			__dm_io_dec_pending(curr);
   1795		} else {
   1796			curr->next = tmp;
   1797			tmp = curr;
   1798		}
   1799	}
   1800
   1801	/* Not done? */
   1802	if (tmp) {
   1803		bio->bi_opf |= REQ_DM_POLL_LIST;
   1804		/* Reset bio->bi_private to dm_io list head */
   1805		*head = tmp;
   1806		return 0;
   1807	}
   1808	return 1;
   1809}
   1810
   1811/*-----------------------------------------------------------------
   1812 * An IDR is used to keep track of allocated minor numbers.
   1813 *---------------------------------------------------------------*/
   1814static void free_minor(int minor)
   1815{
   1816	spin_lock(&_minor_lock);
   1817	idr_remove(&_minor_idr, minor);
   1818	spin_unlock(&_minor_lock);
   1819}
   1820
   1821/*
   1822 * See if the device with a specific minor # is free.
   1823 */
   1824static int specific_minor(int minor)
   1825{
   1826	int r;
   1827
   1828	if (minor >= (1 << MINORBITS))
   1829		return -EINVAL;
   1830
   1831	idr_preload(GFP_KERNEL);
   1832	spin_lock(&_minor_lock);
   1833
   1834	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
   1835
   1836	spin_unlock(&_minor_lock);
   1837	idr_preload_end();
   1838	if (r < 0)
   1839		return r == -ENOSPC ? -EBUSY : r;
   1840	return 0;
   1841}
   1842
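/*
 * Allocate the lowest available minor number.
 */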
   1843static int next_free_minor(int *minor)
   1844{
   1845	int r;
   1846
   1847	idr_preload(GFP_KERNEL);
   1848	spin_lock(&_minor_lock);
   1849
   1850	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
   1851
   1852	spin_unlock(&_minor_lock);
   1853	idr_preload_end();
   1854	if (r < 0)
   1855		return r;
   1856	*minor = r;
   1857	return 0;
   1858}
   1859
   1860static const struct block_device_operations dm_blk_dops;
   1861static const struct block_device_operations dm_rq_blk_dops;
   1862static const struct dax_operations dm_dax_ops;
   1863
   1864static void dm_wq_work(struct work_struct *work);
   1865
   1866#ifdef CONFIG_BLK_INLINE_ENCRYPTION
   1867static void dm_queue_destroy_crypto_profile(struct request_queue *q)
   1868{
   1869	dm_destroy_crypto_profile(q->crypto_profile);
   1870}
   1871
   1872#else /* CONFIG_BLK_INLINE_ENCRYPTION */
   1873
   1874static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
   1875{
   1876}
   1877#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
   1878
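/*
 * Tear down the resources attached to a mapped_device; used both on the
 * alloc_dev() error path and from free_dev().
 */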
   1879static void cleanup_mapped_device(struct mapped_device *md)
   1880{
   1881	if (md->wq)
   1882		destroy_workqueue(md->wq);
   1883	dm_free_md_mempools(md->mempools);
   1884
   1885	if (md->dax_dev) {
   1886		dax_remove_host(md->disk);
   1887		kill_dax(md->dax_dev);
   1888		put_dax(md->dax_dev);
   1889		md->dax_dev = NULL;
   1890	}
   1891
   1892	dm_cleanup_zoned_dev(md);
   1893	if (md->disk) {
   1894		spin_lock(&_minor_lock);
   1895		md->disk->private_data = NULL;
   1896		spin_unlock(&_minor_lock);
   1897		if (dm_get_md_type(md) != DM_TYPE_NONE) {
   1898			dm_sysfs_exit(md);
   1899			del_gendisk(md->disk);
   1900		}
   1901		dm_queue_destroy_crypto_profile(md->queue);
   1902		blk_cleanup_disk(md->disk);
   1903	}
   1904
   1905	if (md->pending_io) {
   1906		free_percpu(md->pending_io);
   1907		md->pending_io = NULL;
   1908	}
   1909
   1910	cleanup_srcu_struct(&md->io_barrier);
   1911
   1912	mutex_destroy(&md->suspend_lock);
   1913	mutex_destroy(&md->type_lock);
   1914	mutex_destroy(&md->table_devices_lock);
   1915	mutex_destroy(&md->swap_bios_lock);
   1916
   1917	dm_mq_cleanup_mapped_device(md);
   1918}
   1919
   1920/*
   1921 * Allocate and initialise a blank device with a given minor.
   1922 */
   1923static struct mapped_device *alloc_dev(int minor)
   1924{
   1925	int r, numa_node_id = dm_get_numa_node();
   1926	struct mapped_device *md;
   1927	void *old_md;
   1928
   1929	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
   1930	if (!md) {
   1931		DMWARN("unable to allocate device, out of memory.");
   1932		return NULL;
   1933	}
   1934
   1935	if (!try_module_get(THIS_MODULE))
   1936		goto bad_module_get;
   1937
   1938	/* get a minor number for the dev */
   1939	if (minor == DM_ANY_MINOR)
   1940		r = next_free_minor(&minor);
   1941	else
   1942		r = specific_minor(minor);
   1943	if (r < 0)
   1944		goto bad_minor;
   1945
   1946	r = init_srcu_struct(&md->io_barrier);
   1947	if (r < 0)
   1948		goto bad_io_barrier;
   1949
   1950	md->numa_node_id = numa_node_id;
   1951	md->init_tio_pdu = false;
   1952	md->type = DM_TYPE_NONE;
   1953	mutex_init(&md->suspend_lock);
   1954	mutex_init(&md->type_lock);
   1955	mutex_init(&md->table_devices_lock);
   1956	spin_lock_init(&md->deferred_lock);
   1957	atomic_set(&md->holders, 1);
   1958	atomic_set(&md->open_count, 0);
   1959	atomic_set(&md->event_nr, 0);
   1960	atomic_set(&md->uevent_seq, 0);
   1961	INIT_LIST_HEAD(&md->uevent_list);
   1962	INIT_LIST_HEAD(&md->table_devices);
   1963	spin_lock_init(&md->uevent_lock);
   1964
   1965	/*
    1966	 * Default to bio-based until a DM table is loaded and md->type is
    1967	 * established. If a request-based table is loaded, blk-mq will
    1968	 * override accordingly.
   1969	 */
   1970	md->disk = blk_alloc_disk(md->numa_node_id);
   1971	if (!md->disk)
   1972		goto bad;
   1973	md->queue = md->disk->queue;
   1974
   1975	init_waitqueue_head(&md->wait);
   1976	INIT_WORK(&md->work, dm_wq_work);
   1977	init_waitqueue_head(&md->eventq);
   1978	init_completion(&md->kobj_holder.completion);
   1979
   1980	md->swap_bios = get_swap_bios();
   1981	sema_init(&md->swap_bios_semaphore, md->swap_bios);
   1982	mutex_init(&md->swap_bios_lock);
   1983
   1984	md->disk->major = _major;
   1985	md->disk->first_minor = minor;
   1986	md->disk->minors = 1;
   1987	md->disk->flags |= GENHD_FL_NO_PART;
   1988	md->disk->fops = &dm_blk_dops;
   1989	md->disk->queue = md->queue;
   1990	md->disk->private_data = md;
   1991	sprintf(md->disk->disk_name, "dm-%d", minor);
   1992
   1993	if (IS_ENABLED(CONFIG_FS_DAX)) {
   1994		md->dax_dev = alloc_dax(md, &dm_dax_ops);
   1995		if (IS_ERR(md->dax_dev)) {
   1996			md->dax_dev = NULL;
   1997			goto bad;
   1998		}
   1999		set_dax_nocache(md->dax_dev);
   2000		set_dax_nomc(md->dax_dev);
   2001		if (dax_add_host(md->dax_dev, md->disk))
   2002			goto bad;
   2003	}
   2004
   2005	format_dev_t(md->name, MKDEV(_major, minor));
   2006
   2007	md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
   2008	if (!md->wq)
   2009		goto bad;
   2010
   2011	md->pending_io = alloc_percpu(unsigned long);
   2012	if (!md->pending_io)
   2013		goto bad;
   2014
   2015	dm_stats_init(&md->stats);
   2016
   2017	/* Populate the mapping, nobody knows we exist yet */
   2018	spin_lock(&_minor_lock);
   2019	old_md = idr_replace(&_minor_idr, md, minor);
   2020	spin_unlock(&_minor_lock);
   2021
   2022	BUG_ON(old_md != MINOR_ALLOCED);
   2023
   2024	return md;
   2025
   2026bad:
   2027	cleanup_mapped_device(md);
   2028bad_io_barrier:
   2029	free_minor(minor);
   2030bad_minor:
   2031	module_put(THIS_MODULE);
   2032bad_module_get:
   2033	kvfree(md);
   2034	return NULL;
   2035}
   2036
   2037static void unlock_fs(struct mapped_device *md);
   2038
   2039static void free_dev(struct mapped_device *md)
   2040{
   2041	int minor = MINOR(disk_devt(md->disk));
   2042
   2043	unlock_fs(md);
   2044
   2045	cleanup_mapped_device(md);
   2046
   2047	free_table_devices(&md->table_devices);
   2048	dm_stats_cleanup(&md->stats);
   2049	free_minor(minor);
   2050
   2051	module_put(THIS_MODULE);
   2052	kvfree(md);
   2053}
   2054
   2055/*
    2056	 * Table event callback set up by __bind(): forwards queued uevents.
   2057 */
   2058static void event_callback(void *context)
   2059{
   2060	unsigned long flags;
   2061	LIST_HEAD(uevents);
   2062	struct mapped_device *md = (struct mapped_device *) context;
   2063
   2064	spin_lock_irqsave(&md->uevent_lock, flags);
   2065	list_splice_init(&md->uevent_list, &uevents);
   2066	spin_unlock_irqrestore(&md->uevent_lock, flags);
   2067
   2068	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
   2069
   2070	atomic_inc(&md->event_nr);
   2071	wake_up(&md->eventq);
   2072	dm_issue_global_event();
   2073}
   2074
   2075/*
   2076 * Returns old map, which caller must destroy.
   2077 */
   2078static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
   2079			       struct queue_limits *limits)
   2080{
   2081	struct dm_table *old_map;
   2082	sector_t size;
   2083	int ret;
   2084
   2085	lockdep_assert_held(&md->suspend_lock);
   2086
   2087	size = dm_table_get_size(t);
   2088
   2089	/*
   2090	 * Wipe any geometry if the size of the table changed.
   2091	 */
   2092	if (size != dm_get_size(md))
   2093		memset(&md->geometry, 0, sizeof(md->geometry));
   2094
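	/*
	 * If the disk has no capacity yet, just set it; otherwise also
	 * notify userspace of the resize.
	 */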
   2095	if (!get_capacity(md->disk))
   2096		set_capacity(md->disk, size);
   2097	else
   2098		set_capacity_and_notify(md->disk, size);
   2099
   2100	dm_table_event_callback(t, event_callback, md);
   2101
   2102	if (dm_table_request_based(t)) {
   2103		/*
   2104		 * Leverage the fact that request-based DM targets are
   2105		 * immutable singletons - used to optimize dm_mq_queue_rq.
   2106		 */
   2107		md->immutable_target = dm_table_get_immutable_target(t);
   2108
   2109		/*
   2110		 * There is no need to reload with request-based dm because the
   2111		 * size of front_pad doesn't change.
   2112		 *
    2113		 * Note for the future: if you ever reload the bioset, prepped
    2114		 * requests in the queue may still refer to bios from the old
    2115		 * bioset, so you must walk the queue and unprep them.
   2116		 */
   2117		if (!md->mempools) {
   2118			md->mempools = t->mempools;
   2119			t->mempools = NULL;
   2120		}
   2121	} else {
   2122		/*
   2123		 * The md may already have mempools that need changing.
    2124		 * If so, reload the bioset because front_pad may have changed
    2125		 * when a different table was loaded.
   2126		 */
   2127		dm_free_md_mempools(md->mempools);
   2128		md->mempools = t->mempools;
   2129		t->mempools = NULL;
   2130	}
   2131
   2132	ret = dm_table_set_restrictions(t, md->queue, limits);
   2133	if (ret) {
   2134		old_map = ERR_PTR(ret);
   2135		goto out;
   2136	}
   2137
   2138	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
   2139	rcu_assign_pointer(md->map, (void *)t);
   2140	md->immutable_target_type = dm_table_get_immutable_target_type(t);
   2141
   2142	if (old_map)
   2143		dm_sync_table(md);
   2144out:
   2145	return old_map;
   2146}
   2147
   2148/*
   2149 * Returns unbound table for the caller to free.
   2150 */
   2151static struct dm_table *__unbind(struct mapped_device *md)
   2152{
   2153	struct dm_table *map = rcu_dereference_protected(md->map, 1);
   2154
   2155	if (!map)
   2156		return NULL;
   2157
   2158	dm_table_event_callback(map, NULL, NULL);
   2159	RCU_INIT_POINTER(md->map, NULL);
   2160	dm_sync_table(md);
   2161
   2162	return map;
   2163}
   2164
   2165/*
   2166 * Constructor for a new device.
   2167 */
   2168int dm_create(int minor, struct mapped_device **result)
   2169{
   2170	struct mapped_device *md;
   2171
   2172	md = alloc_dev(minor);
   2173	if (!md)
   2174		return -ENXIO;
   2175
   2176	dm_ima_reset_data(md);
   2177
   2178	*result = md;
   2179	return 0;
   2180}
   2181
   2182/*
   2183 * Functions to manage md->type.
   2184 * All are required to hold md->type_lock.
   2185 */
   2186void dm_lock_md_type(struct mapped_device *md)
   2187{
   2188	mutex_lock(&md->type_lock);
   2189}
   2190
   2191void dm_unlock_md_type(struct mapped_device *md)
   2192{
   2193	mutex_unlock(&md->type_lock);
   2194}
   2195
   2196void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
   2197{
   2198	BUG_ON(!mutex_is_locked(&md->type_lock));
   2199	md->type = type;
   2200}
   2201
   2202enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
   2203{
   2204	return md->type;
   2205}
   2206
   2207struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
   2208{
   2209	return md->immutable_target_type;
   2210}
   2211
   2212/*
   2213 * The queue_limits are only valid as long as you have a reference
   2214 * count on 'md'.
   2215 */
   2216struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
   2217{
   2218	BUG_ON(!atomic_read(&md->holders));
   2219	return &md->queue->limits;
   2220}
   2221EXPORT_SYMBOL_GPL(dm_get_queue_limits);
   2222
   2223/*
   2224 * Setup the DM device's queue based on md's type
   2225 */
   2226int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
   2227{
   2228	enum dm_queue_mode type = dm_table_get_type(t);
   2229	struct queue_limits limits;
   2230	int r;
   2231
   2232	switch (type) {
   2233	case DM_TYPE_REQUEST_BASED:
   2234		md->disk->fops = &dm_rq_blk_dops;
   2235		r = dm_mq_init_request_queue(md, t);
   2236		if (r) {
   2237			DMERR("Cannot initialize queue for request-based dm mapped device");
   2238			return r;
   2239		}
   2240		break;
   2241	case DM_TYPE_BIO_BASED:
   2242	case DM_TYPE_DAX_BIO_BASED:
   2243		break;
   2244	case DM_TYPE_NONE:
   2245		WARN_ON_ONCE(true);
   2246		break;
   2247	}
   2248
   2249	r = dm_calculate_queue_limits(t, &limits);
   2250	if (r) {
   2251		DMERR("Cannot calculate initial queue limits");
   2252		return r;
   2253	}
   2254	r = dm_table_set_restrictions(t, md->queue, &limits);
   2255	if (r)
   2256		return r;
   2257
   2258	r = add_disk(md->disk);
   2259	if (r)
   2260		return r;
   2261
   2262	r = dm_sysfs_init(md);
   2263	if (r) {
   2264		del_gendisk(md->disk);
   2265		return r;
   2266	}
   2267	md->type = type;
   2268	return 0;
   2269}
   2270
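/*
 * Look up a mapped_device by dev_t and take a reference; returns NULL if the
 * device does not exist or is being deleted or freed.
 */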
   2271struct mapped_device *dm_get_md(dev_t dev)
   2272{
   2273	struct mapped_device *md;
   2274	unsigned minor = MINOR(dev);
   2275
   2276	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
   2277		return NULL;
   2278
   2279	spin_lock(&_minor_lock);
   2280
   2281	md = idr_find(&_minor_idr, minor);
   2282	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
   2283	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
   2284		md = NULL;
   2285		goto out;
   2286	}
   2287	dm_get(md);
   2288out:
   2289	spin_unlock(&_minor_lock);
   2290
   2291	return md;
   2292}
   2293EXPORT_SYMBOL_GPL(dm_get_md);
   2294
   2295void *dm_get_mdptr(struct mapped_device *md)
   2296{
   2297	return md->interface_ptr;
   2298}
   2299
   2300void dm_set_mdptr(struct mapped_device *md, void *ptr)
   2301{
   2302	md->interface_ptr = ptr;
   2303}
   2304
   2305void dm_get(struct mapped_device *md)
   2306{
   2307	atomic_inc(&md->holders);
   2308	BUG_ON(test_bit(DMF_FREEING, &md->flags));
   2309}
   2310
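/*
 * Like dm_get(), but returns -EBUSY instead of hitting the BUG_ON when the
 * device is already being freed.
 */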
   2311int dm_hold(struct mapped_device *md)
   2312{
   2313	spin_lock(&_minor_lock);
   2314	if (test_bit(DMF_FREEING, &md->flags)) {
   2315		spin_unlock(&_minor_lock);
   2316		return -EBUSY;
   2317	}
   2318	dm_get(md);
   2319	spin_unlock(&_minor_lock);
   2320	return 0;
   2321}
   2322EXPORT_SYMBOL_GPL(dm_hold);
   2323
   2324const char *dm_device_name(struct mapped_device *md)
   2325{
   2326	return md->name;
   2327}
   2328EXPORT_SYMBOL_GPL(dm_device_name);
   2329
   2330static void __dm_destroy(struct mapped_device *md, bool wait)
   2331{
   2332	struct dm_table *map;
   2333	int srcu_idx;
   2334
   2335	might_sleep();
   2336
   2337	spin_lock(&_minor_lock);
   2338	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
   2339	set_bit(DMF_FREEING, &md->flags);
   2340	spin_unlock(&_minor_lock);
   2341
   2342	blk_mark_disk_dead(md->disk);
   2343
   2344	/*
   2345	 * Take suspend_lock so that presuspend and postsuspend methods
   2346	 * do not race with internal suspend.
   2347	 */
   2348	mutex_lock(&md->suspend_lock);
   2349	map = dm_get_live_table(md, &srcu_idx);
   2350	if (!dm_suspended_md(md)) {
   2351		dm_table_presuspend_targets(map);
   2352		set_bit(DMF_SUSPENDED, &md->flags);
   2353		set_bit(DMF_POST_SUSPENDING, &md->flags);
   2354		dm_table_postsuspend_targets(map);
   2355	}
   2356	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
   2357	dm_put_live_table(md, srcu_idx);
   2358	mutex_unlock(&md->suspend_lock);
   2359
   2360	/*
    2361	 * Rare, but there may still be I/O requests in flight that have yet
    2362	 * to complete.  Wait for all references to disappear.
    2363	 * No one should increment the reference count of the mapped_device
    2364	 * after the mapped_device state becomes DMF_FREEING.
   2365	 */
   2366	if (wait)
   2367		while (atomic_read(&md->holders))
   2368			msleep(1);
   2369	else if (atomic_read(&md->holders))
   2370		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
   2371		       dm_device_name(md), atomic_read(&md->holders));
   2372
   2373	dm_table_destroy(__unbind(md));
   2374	free_dev(md);
   2375}
   2376
   2377void dm_destroy(struct mapped_device *md)
   2378{
   2379	__dm_destroy(md, true);
   2380}
   2381
   2382void dm_destroy_immediate(struct mapped_device *md)
   2383{
   2384	__dm_destroy(md, false);
   2385}
   2386
   2387void dm_put(struct mapped_device *md)
   2388{
   2389	atomic_dec(&md->holders);
   2390}
   2391EXPORT_SYMBOL_GPL(dm_put);
   2392
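/*
 * Sum the per-CPU pending_io counters to check whether any bio-based I/O is
 * still in flight.
 */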
   2393static bool dm_in_flight_bios(struct mapped_device *md)
   2394{
   2395	int cpu;
   2396	unsigned long sum = 0;
   2397
   2398	for_each_possible_cpu(cpu)
   2399		sum += *per_cpu_ptr(md->pending_io, cpu);
   2400
   2401	return sum != 0;
   2402}
   2403
   2404static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
   2405{
   2406	int r = 0;
   2407	DEFINE_WAIT(wait);
   2408
   2409	while (true) {
   2410		prepare_to_wait(&md->wait, &wait, task_state);
   2411
   2412		if (!dm_in_flight_bios(md))
   2413			break;
   2414
   2415		if (signal_pending_state(task_state, current)) {
   2416			r = -EINTR;
   2417			break;
   2418		}
   2419
   2420		io_schedule();
   2421	}
   2422	finish_wait(&md->wait, &wait);
   2423
   2424	smp_rmb();
   2425
   2426	return r;
   2427}
   2428
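/*
 * Wait for all in-flight I/O to complete: request-based devices poll
 * blk_mq_queue_inflight(), bio-based devices sleep on md->wait until the
 * per-CPU pending_io counters drain.
 */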
   2429static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
   2430{
   2431	int r = 0;
   2432
   2433	if (!queue_is_mq(md->queue))
   2434		return dm_wait_for_bios_completion(md, task_state);
   2435
   2436	while (true) {
   2437		if (!blk_mq_queue_inflight(md->queue))
   2438			break;
   2439
   2440		if (signal_pending_state(task_state, current)) {
   2441			r = -EINTR;
   2442			break;
   2443		}
   2444
   2445		msleep(5);
   2446	}
   2447
   2448	return r;
   2449}
   2450
   2451/*
   2452 * Process the deferred bios
   2453 */
   2454static void dm_wq_work(struct work_struct *work)
   2455{
   2456	struct mapped_device *md = container_of(work, struct mapped_device, work);
   2457	struct bio *bio;
   2458
   2459	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
   2460		spin_lock_irq(&md->deferred_lock);
   2461		bio = bio_list_pop(&md->deferred);
   2462		spin_unlock_irq(&md->deferred_lock);
   2463
   2464		if (!bio)
   2465			break;
   2466
   2467		submit_bio_noacct(bio);
   2468	}
   2469}
   2470
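/*
 * Re-enable bio submission and kick the worker so that bios deferred while
 * suspended get resubmitted.
 */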
   2471static void dm_queue_flush(struct mapped_device *md)
   2472{
   2473	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
   2474	smp_mb__after_atomic();
   2475	queue_work(md->wq, &md->work);
   2476}
   2477
   2478/*
   2479 * Swap in a new table, returning the old one for the caller to destroy.
   2480 */
   2481struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
   2482{
   2483	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
   2484	struct queue_limits limits;
   2485	int r;
   2486
   2487	mutex_lock(&md->suspend_lock);
   2488
   2489	/* device must be suspended */
   2490	if (!dm_suspended_md(md))
   2491		goto out;
   2492
   2493	/*
   2494	 * If the new table has no data devices, retain the existing limits.
    2495	 * This helps multipath with queue_if_no_path: if all paths disappear,
    2496	 * new I/O is queued based on these limits and can be serviced once
    2497	 * some paths reappear.
   2498	 */
   2499	if (dm_table_has_no_data_devices(table)) {
   2500		live_map = dm_get_live_table_fast(md);
   2501		if (live_map)
   2502			limits = md->queue->limits;
   2503		dm_put_live_table_fast(md);
   2504	}
   2505
   2506	if (!live_map) {
   2507		r = dm_calculate_queue_limits(table, &limits);
   2508		if (r) {
   2509			map = ERR_PTR(r);
   2510			goto out;
   2511		}
   2512	}
   2513
   2514	map = __bind(md, table, &limits);
   2515	dm_issue_global_event();
   2516
   2517out:
   2518	mutex_unlock(&md->suspend_lock);
   2519	return map;
   2520}
   2521
   2522/*
   2523 * Functions to lock and unlock any filesystem running on the
   2524 * device.
   2525 */
   2526static int lock_fs(struct mapped_device *md)
   2527{
   2528	int r;
   2529
   2530	WARN_ON(test_bit(DMF_FROZEN, &md->flags));
   2531
   2532	r = freeze_bdev(md->disk->part0);
   2533	if (!r)
   2534		set_bit(DMF_FROZEN, &md->flags);
   2535	return r;
   2536}
   2537
   2538static void unlock_fs(struct mapped_device *md)
   2539{
   2540	if (!test_bit(DMF_FROZEN, &md->flags))
   2541		return;
   2542	thaw_bdev(md->disk->part0);
   2543	clear_bit(DMF_FROZEN, &md->flags);
   2544}
   2545
   2546/*
   2547 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
   2548 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
   2549 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
   2550 *
   2551 * If __dm_suspend returns 0, the device is completely quiescent
   2552 * now. There is no request-processing activity. All new requests
   2553 * are being added to md->deferred list.
   2554 */
   2555static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
   2556			unsigned suspend_flags, unsigned int task_state,
   2557			int dmf_suspended_flag)
   2558{
   2559	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
   2560	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
   2561	int r;
   2562
   2563	lockdep_assert_held(&md->suspend_lock);
   2564
   2565	/*
   2566	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
   2567	 * This flag is cleared before dm_suspend returns.
   2568	 */
   2569	if (noflush)
   2570		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
   2571	else
   2572		DMDEBUG("%s: suspending with flush", dm_device_name(md));
   2573
   2574	/*
   2575	 * This gets reverted if there's an error later and the targets
   2576	 * provide the .presuspend_undo hook.
   2577	 */
   2578	dm_table_presuspend_targets(map);
   2579
   2580	/*
   2581	 * Flush I/O to the device.
   2582	 * Any I/O submitted after lock_fs() may not be flushed.
   2583	 * noflush takes precedence over do_lockfs.
   2584	 * (lock_fs() flushes I/Os and waits for them to complete.)
   2585	 */
   2586	if (!noflush && do_lockfs) {
   2587		r = lock_fs(md);
   2588		if (r) {
   2589			dm_table_presuspend_undo_targets(map);
   2590			return r;
   2591		}
   2592	}
   2593
   2594	/*
   2595	 * Here we must make sure that no processes are submitting requests
   2596	 * to target drivers i.e. no one may be executing
   2597	 * dm_split_and_process_bio from dm_submit_bio.
   2598	 *
   2599	 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
   2600	 * we take the write lock. To prevent any process from reentering
   2601	 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
   2602	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
   2603	 * flush_workqueue(md->wq).
   2604	 */
   2605	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
   2606	if (map)
   2607		synchronize_srcu(&md->io_barrier);
   2608
   2609	/*
   2610	 * Stop md->queue before flushing md->wq in case request-based
   2611	 * dm defers requests to md->wq from md->queue.
   2612	 */
   2613	if (dm_request_based(md))
   2614		dm_stop_queue(md->queue);
   2615
   2616	flush_workqueue(md->wq);
   2617
   2618	/*
   2619	 * At this point no more requests are entering target request routines.
   2620	 * We call dm_wait_for_completion to wait for all existing requests
   2621	 * to finish.
   2622	 */
   2623	r = dm_wait_for_completion(md, task_state);
   2624	if (!r)
   2625		set_bit(dmf_suspended_flag, &md->flags);
   2626
   2627	if (noflush)
   2628		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
   2629	if (map)
   2630		synchronize_srcu(&md->io_barrier);
   2631
   2632	/* were we interrupted ? */
   2633	if (r < 0) {
   2634		dm_queue_flush(md);
   2635
   2636		if (dm_request_based(md))
   2637			dm_start_queue(md->queue);
   2638
   2639		unlock_fs(md);
   2640		dm_table_presuspend_undo_targets(map);
   2641		/* pushback list is already flushed, so skip flush */
   2642	}
   2643
   2644	return r;
   2645}
   2646
   2647/*
   2648 * We need to be able to change a mapping table under a mounted
   2649 * filesystem.  For example we might want to move some data in
   2650 * the background.  Before the table can be swapped with
   2651 * dm_bind_table, dm_suspend must be called to flush any in
   2652 * flight bios and ensure that any further io gets deferred.
   2653 */
   2654/*
   2655 * Suspend mechanism in request-based dm.
   2656 *
   2657 * 1. Flush all I/Os by lock_fs() if needed.
   2658 * 2. Stop dispatching any I/O by stopping the request_queue.
   2659 * 3. Wait for all in-flight I/Os to be completed or requeued.
   2660 *
   2661 * To abort suspend, start the request_queue.
   2662 */
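/*
 * Rough sketch of the usual caller sequence (the real callers live in
 * drivers/md/dm-ioctl.c):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	... load or swap the table, inspect state ...
 *	dm_resume(md);
 */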
   2663int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
   2664{
   2665	struct dm_table *map = NULL;
   2666	int r = 0;
   2667
   2668retry:
   2669	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
   2670
   2671	if (dm_suspended_md(md)) {
   2672		r = -EINVAL;
   2673		goto out_unlock;
   2674	}
   2675
   2676	if (dm_suspended_internally_md(md)) {
   2677		/* already internally suspended, wait for internal resume */
   2678		mutex_unlock(&md->suspend_lock);
   2679		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
   2680		if (r)
   2681			return r;
   2682		goto retry;
   2683	}
   2684
   2685	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
   2686
   2687	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
   2688	if (r)
   2689		goto out_unlock;
   2690
   2691	set_bit(DMF_POST_SUSPENDING, &md->flags);
   2692	dm_table_postsuspend_targets(map);
   2693	clear_bit(DMF_POST_SUSPENDING, &md->flags);
   2694
   2695out_unlock:
   2696	mutex_unlock(&md->suspend_lock);
   2697	return r;
   2698}
   2699
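/*
 * Undo __dm_suspend(): resume the targets, resubmit deferred bios, restart
 * the request queue for request-based devices and thaw the filesystem.
 */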
   2700static int __dm_resume(struct mapped_device *md, struct dm_table *map)
   2701{
   2702	if (map) {
   2703		int r = dm_table_resume_targets(map);
   2704		if (r)
   2705			return r;
   2706	}
   2707
   2708	dm_queue_flush(md);
   2709
   2710	/*
   2711	 * Flushing deferred I/Os must be done after targets are resumed
    2712	 * so that the targets can map them correctly.
    2713	 * Request-based dm queues the deferred I/Os in its request_queue.
   2714	 */
   2715	if (dm_request_based(md))
   2716		dm_start_queue(md->queue);
   2717
   2718	unlock_fs(md);
   2719
   2720	return 0;
   2721}
   2722
   2723int dm_resume(struct mapped_device *md)
   2724{
   2725	int r;
   2726	struct dm_table *map = NULL;
   2727
   2728retry:
   2729	r = -EINVAL;
   2730	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
   2731
   2732	if (!dm_suspended_md(md))
   2733		goto out;
   2734
   2735	if (dm_suspended_internally_md(md)) {
   2736		/* already internally suspended, wait for internal resume */
   2737		mutex_unlock(&md->suspend_lock);
   2738		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
   2739		if (r)
   2740			return r;
   2741		goto retry;
   2742	}
   2743
   2744	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
   2745	if (!map || !dm_table_get_size(map))
   2746		goto out;
   2747
   2748	r = __dm_resume(md, map);
   2749	if (r)
   2750		goto out;
   2751
   2752	clear_bit(DMF_SUSPENDED, &md->flags);
   2753out:
   2754	mutex_unlock(&md->suspend_lock);
   2755
   2756	return r;
   2757}
   2758
   2759/*
   2760 * Internal suspend/resume works like userspace-driven suspend. It waits
   2761 * until all bios finish and prevents issuing new bios to the target drivers.
   2762 * It may be used only from the kernel.
   2763 */
   2764
   2765static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
   2766{
   2767	struct dm_table *map = NULL;
   2768
   2769	lockdep_assert_held(&md->suspend_lock);
   2770
   2771	if (md->internal_suspend_count++)
   2772		return; /* nested internal suspend */
   2773
   2774	if (dm_suspended_md(md)) {
   2775		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
   2776		return; /* nest suspend */
   2777	}
   2778
   2779	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
   2780
   2781	/*
   2782	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
   2783	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
   2784	 * would require changing .presuspend to return an error -- avoid this
   2785	 * until there is a need for more elaborate variants of internal suspend.
   2786	 */
   2787	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
   2788			    DMF_SUSPENDED_INTERNALLY);
   2789
   2790	set_bit(DMF_POST_SUSPENDING, &md->flags);
   2791	dm_table_postsuspend_targets(map);
   2792	clear_bit(DMF_POST_SUSPENDING, &md->flags);
   2793}
   2794
   2795static void __dm_internal_resume(struct mapped_device *md)
   2796{
   2797	BUG_ON(!md->internal_suspend_count);
   2798
   2799	if (--md->internal_suspend_count)
   2800		return; /* resume from nested internal suspend */
   2801
   2802	if (dm_suspended_md(md))
   2803		goto done; /* resume from nested suspend */
   2804
   2805	/*
   2806	 * NOTE: existing callers don't need to call dm_table_resume_targets
   2807	 * (which may fail -- so best to avoid it for now by passing NULL map)
   2808	 */
   2809	(void) __dm_resume(md, NULL);
   2810
   2811done:
   2812	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
   2813	smp_mb__after_atomic();
   2814	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
   2815}
   2816
   2817void dm_internal_suspend_noflush(struct mapped_device *md)
   2818{
   2819	mutex_lock(&md->suspend_lock);
   2820	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
   2821	mutex_unlock(&md->suspend_lock);
   2822}
   2823EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
   2824
   2825void dm_internal_resume(struct mapped_device *md)
   2826{
   2827	mutex_lock(&md->suspend_lock);
   2828	__dm_internal_resume(md);
   2829	mutex_unlock(&md->suspend_lock);
   2830}
   2831EXPORT_SYMBOL_GPL(dm_internal_resume);
   2832
   2833/*
   2834 * Fast variants of internal suspend/resume hold md->suspend_lock,
   2835 * which prevents interaction with userspace-driven suspend.
   2836 */
   2837
   2838void dm_internal_suspend_fast(struct mapped_device *md)
   2839{
   2840	mutex_lock(&md->suspend_lock);
   2841	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
   2842		return;
   2843
   2844	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
   2845	synchronize_srcu(&md->io_barrier);
   2846	flush_workqueue(md->wq);
   2847	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
   2848}
   2849EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
   2850
   2851void dm_internal_resume_fast(struct mapped_device *md)
   2852{
   2853	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
   2854		goto done;
   2855
   2856	dm_queue_flush(md);
   2857
   2858done:
   2859	mutex_unlock(&md->suspend_lock);
   2860}
   2861EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
   2862
   2863/*-----------------------------------------------------------------
   2864 * Event notification.
   2865 *---------------------------------------------------------------*/
   2866int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
   2867		       unsigned cookie)
   2868{
   2869	int r;
   2870	unsigned noio_flag;
   2871	char udev_cookie[DM_COOKIE_LENGTH];
   2872	char *envp[] = { udev_cookie, NULL };
   2873
   2874	noio_flag = memalloc_noio_save();
   2875
   2876	if (!cookie)
   2877		r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
   2878	else {
   2879		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
   2880			 DM_COOKIE_ENV_VAR_NAME, cookie);
   2881		r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
   2882				       action, envp);
   2883	}
   2884
   2885	memalloc_noio_restore(noio_flag);
   2886
   2887	return r;
   2888}
   2889
   2890uint32_t dm_next_uevent_seq(struct mapped_device *md)
   2891{
   2892	return atomic_add_return(1, &md->uevent_seq);
   2893}
   2894
   2895uint32_t dm_get_event_nr(struct mapped_device *md)
   2896{
   2897	return atomic_read(&md->event_nr);
   2898}
   2899
   2900int dm_wait_event(struct mapped_device *md, int event_nr)
   2901{
   2902	return wait_event_interruptible(md->eventq,
   2903			(event_nr != atomic_read(&md->event_nr)));
   2904}
   2905
   2906void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
   2907{
   2908	unsigned long flags;
   2909
   2910	spin_lock_irqsave(&md->uevent_lock, flags);
   2911	list_add(elist, &md->uevent_list);
   2912	spin_unlock_irqrestore(&md->uevent_lock, flags);
   2913}
   2914
   2915/*
   2916 * The gendisk is only valid as long as you have a reference
   2917 * count on 'md'.
   2918 */
   2919struct gendisk *dm_disk(struct mapped_device *md)
   2920{
   2921	return md->disk;
   2922}
   2923EXPORT_SYMBOL_GPL(dm_disk);
   2924
   2925struct kobject *dm_kobject(struct mapped_device *md)
   2926{
   2927	return &md->kobj_holder.kobj;
   2928}
   2929
   2930struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
   2931{
   2932	struct mapped_device *md;
   2933
   2934	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
   2935
   2936	spin_lock(&_minor_lock);
   2937	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
   2938		md = NULL;
   2939		goto out;
   2940	}
   2941	dm_get(md);
   2942out:
   2943	spin_unlock(&_minor_lock);
   2944
   2945	return md;
   2946}
   2947
   2948int dm_suspended_md(struct mapped_device *md)
   2949{
   2950	return test_bit(DMF_SUSPENDED, &md->flags);
   2951}
   2952
   2953static int dm_post_suspending_md(struct mapped_device *md)
   2954{
   2955	return test_bit(DMF_POST_SUSPENDING, &md->flags);
   2956}
   2957
   2958int dm_suspended_internally_md(struct mapped_device *md)
   2959{
   2960	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
   2961}
   2962
   2963int dm_test_deferred_remove_flag(struct mapped_device *md)
   2964{
   2965	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
   2966}
   2967
   2968int dm_suspended(struct dm_target *ti)
   2969{
   2970	return dm_suspended_md(ti->table->md);
   2971}
   2972EXPORT_SYMBOL_GPL(dm_suspended);
   2973
   2974int dm_post_suspending(struct dm_target *ti)
   2975{
   2976	return dm_post_suspending_md(ti->table->md);
   2977}
   2978EXPORT_SYMBOL_GPL(dm_post_suspending);
   2979
   2980int dm_noflush_suspending(struct dm_target *ti)
   2981{
   2982	return __noflush_suspending(ti->table->md);
   2983}
   2984EXPORT_SYMBOL_GPL(dm_noflush_suspending);
   2985
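/*
 * Allocate the bio sets used for cloning.  Bio-based tables get io_bs (for
 * dm_io plus per-bio data) in addition to bs; request-based tables only need
 * bs, sized for struct dm_rq_clone_bio_info.
 */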
   2986struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
   2987					    unsigned per_io_data_size, unsigned min_pool_size,
   2988					    bool integrity, bool poll)
   2989{
   2990	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
   2991	unsigned int pool_size = 0;
   2992	unsigned int front_pad, io_front_pad;
   2993	int ret;
   2994
   2995	if (!pools)
   2996		return NULL;
   2997
   2998	switch (type) {
   2999	case DM_TYPE_BIO_BASED:
   3000	case DM_TYPE_DAX_BIO_BASED:
   3001		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
   3002		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
   3003		io_front_pad = roundup(per_io_data_size,  __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
   3004		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0);
   3005		if (ret)
   3006			goto out;
   3007		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
   3008			goto out;
   3009		break;
   3010	case DM_TYPE_REQUEST_BASED:
   3011		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
   3012		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
   3013		/* per_io_data_size is used for blk-mq pdu at queue allocation */
   3014		break;
   3015	default:
   3016		BUG();
   3017	}
   3018
   3019	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
   3020	if (ret)
   3021		goto out;
   3022
   3023	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
   3024		goto out;
   3025
   3026	return pools;
   3027
   3028out:
   3029	dm_free_md_mempools(pools);
   3030
   3031	return NULL;
   3032}
   3033
   3034void dm_free_md_mempools(struct dm_md_mempools *pools)
   3035{
   3036	if (!pools)
   3037		return;
   3038
   3039	bioset_exit(&pools->bs);
   3040	bioset_exit(&pools->io_bs);
   3041
   3042	kfree(pools);
   3043}
   3044
   3045struct dm_pr {
   3046	u64	old_key;
   3047	u64	new_key;
   3048	u32	flags;
   3049	bool	fail_early;
   3050};
   3051
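/*
 * Invoke @fn via the target's iterate_devices method.  Persistent
 * reservation operations are only supported on single-target tables.
 */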
   3052static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
   3053		      void *data)
   3054{
   3055	struct mapped_device *md = bdev->bd_disk->private_data;
   3056	struct dm_table *table;
   3057	struct dm_target *ti;
   3058	int ret = -ENOTTY, srcu_idx;
   3059
   3060	table = dm_get_live_table(md, &srcu_idx);
   3061	if (!table || !dm_table_get_size(table))
   3062		goto out;
   3063
   3064	/* We only support devices that have a single target */
   3065	if (dm_table_get_num_targets(table) != 1)
   3066		goto out;
   3067	ti = dm_table_get_target(table, 0);
   3068
   3069	ret = -EINVAL;
   3070	if (!ti->type->iterate_devices)
   3071		goto out;
   3072
   3073	ret = ti->type->iterate_devices(ti, fn, data);
   3074out:
   3075	dm_put_live_table(md, srcu_idx);
   3076	return ret;
   3077}
   3078
   3079/*
   3080 * For register / unregister we need to manually call out to every path.
   3081 */
   3082static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
   3083			    sector_t start, sector_t len, void *data)
   3084{
   3085	struct dm_pr *pr = data;
   3086	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
   3087
   3088	if (!ops || !ops->pr_register)
   3089		return -EOPNOTSUPP;
   3090	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
   3091}
   3092
   3093static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
   3094			  u32 flags)
   3095{
   3096	struct dm_pr pr = {
   3097		.old_key	= old_key,
   3098		.new_key	= new_key,
   3099		.flags		= flags,
   3100		.fail_early	= true,
   3101	};
   3102	int ret;
   3103
   3104	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
   3105	if (ret && new_key) {
   3106		/* unregister all paths if we failed to register any path */
   3107		pr.old_key = new_key;
   3108		pr.new_key = 0;
   3109		pr.flags = 0;
   3110		pr.fail_early = false;
   3111		dm_call_pr(bdev, __dm_pr_register, &pr);
   3112	}
   3113
   3114	return ret;
   3115}
   3116
   3117static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
   3118			 u32 flags)
   3119{
   3120	struct mapped_device *md = bdev->bd_disk->private_data;
   3121	const struct pr_ops *ops;
   3122	int r, srcu_idx;
   3123
   3124	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
   3125	if (r < 0)
   3126		goto out;
   3127
   3128	ops = bdev->bd_disk->fops->pr_ops;
   3129	if (ops && ops->pr_reserve)
   3130		r = ops->pr_reserve(bdev, key, type, flags);
   3131	else
   3132		r = -EOPNOTSUPP;
   3133out:
   3134	dm_unprepare_ioctl(md, srcu_idx);
   3135	return r;
   3136}
   3137
   3138static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
   3139{
   3140	struct mapped_device *md = bdev->bd_disk->private_data;
   3141	const struct pr_ops *ops;
   3142	int r, srcu_idx;
   3143
   3144	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
   3145	if (r < 0)
   3146		goto out;
   3147
   3148	ops = bdev->bd_disk->fops->pr_ops;
   3149	if (ops && ops->pr_release)
   3150		r = ops->pr_release(bdev, key, type);
   3151	else
   3152		r = -EOPNOTSUPP;
   3153out:
   3154	dm_unprepare_ioctl(md, srcu_idx);
   3155	return r;
   3156}
   3157
   3158static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
   3159			 enum pr_type type, bool abort)
   3160{
   3161	struct mapped_device *md = bdev->bd_disk->private_data;
   3162	const struct pr_ops *ops;
   3163	int r, srcu_idx;
   3164
   3165	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
   3166	if (r < 0)
   3167		goto out;
   3168
   3169	ops = bdev->bd_disk->fops->pr_ops;
   3170	if (ops && ops->pr_preempt)
   3171		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
   3172	else
   3173		r = -EOPNOTSUPP;
   3174out:
   3175	dm_unprepare_ioctl(md, srcu_idx);
   3176	return r;
   3177}
   3178
   3179static int dm_pr_clear(struct block_device *bdev, u64 key)
   3180{
   3181	struct mapped_device *md = bdev->bd_disk->private_data;
   3182	const struct pr_ops *ops;
   3183	int r, srcu_idx;
   3184
   3185	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
   3186	if (r < 0)
   3187		goto out;
   3188
   3189	ops = bdev->bd_disk->fops->pr_ops;
   3190	if (ops && ops->pr_clear)
   3191		r = ops->pr_clear(bdev, key);
   3192	else
   3193		r = -EOPNOTSUPP;
   3194out:
   3195	dm_unprepare_ioctl(md, srcu_idx);
   3196	return r;
   3197}
   3198
   3199static const struct pr_ops dm_pr_ops = {
   3200	.pr_register	= dm_pr_register,
   3201	.pr_reserve	= dm_pr_reserve,
   3202	.pr_release	= dm_pr_release,
   3203	.pr_preempt	= dm_pr_preempt,
   3204	.pr_clear	= dm_pr_clear,
   3205};
   3206
   3207static const struct block_device_operations dm_blk_dops = {
   3208	.submit_bio = dm_submit_bio,
   3209	.poll_bio = dm_poll_bio,
   3210	.open = dm_blk_open,
   3211	.release = dm_blk_close,
   3212	.ioctl = dm_blk_ioctl,
   3213	.getgeo = dm_blk_getgeo,
   3214	.report_zones = dm_blk_report_zones,
   3215	.pr_ops = &dm_pr_ops,
   3216	.owner = THIS_MODULE
   3217};
   3218
   3219static const struct block_device_operations dm_rq_blk_dops = {
   3220	.open = dm_blk_open,
   3221	.release = dm_blk_close,
   3222	.ioctl = dm_blk_ioctl,
   3223	.getgeo = dm_blk_getgeo,
   3224	.pr_ops = &dm_pr_ops,
   3225	.owner = THIS_MODULE
   3226};
   3227
   3228static const struct dax_operations dm_dax_ops = {
   3229	.direct_access = dm_dax_direct_access,
   3230	.zero_page_range = dm_dax_zero_page_range,
   3231	.recovery_write = dm_dax_recovery_write,
   3232};
   3233
   3234/*
   3235 * module hooks
   3236 */
   3237module_init(dm_init);
   3238module_exit(dm_exit);
   3239
   3240module_param(major, uint, 0);
   3241MODULE_PARM_DESC(major, "The major number of the device mapper");
   3242
   3243module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
   3244MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
   3245
   3246module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
   3247MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
   3248
   3249module_param(swap_bios, int, S_IRUGO | S_IWUSR);
   3250MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
   3251
   3252MODULE_DESCRIPTION(DM_NAME " driver");
   3253MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
   3254MODULE_LICENSE("GPL");