dm-thin.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
dm-thin.c (114290B)
      1/*
      2 * Copyright (C) 2011-2012 Red Hat UK.
      3 *
      4 * This file is released under the GPL.
      5 */
      6
      7#include "dm-thin-metadata.h"
      8#include "dm-bio-prison-v1.h"
      9#include "dm.h"
     10
     11#include <linux/device-mapper.h>
     12#include <linux/dm-io.h>
     13#include <linux/dm-kcopyd.h>
     14#include <linux/jiffies.h>
     15#include <linux/log2.h>
     16#include <linux/list.h>
     17#include <linux/rculist.h>
     18#include <linux/init.h>
     19#include <linux/module.h>
     20#include <linux/slab.h>
     21#include <linux/vmalloc.h>
     22#include <linux/sort.h>
     23#include <linux/rbtree.h>
     24
     25#define	DM_MSG_PREFIX	"thin"
     26
     27/*
     28 * Tunable constants
     29 */
     30#define ENDIO_HOOK_POOL_SIZE 1024
     31#define MAPPING_POOL_SIZE 1024
     32#define COMMIT_PERIOD HZ
     33#define NO_SPACE_TIMEOUT_SECS 60
     34
     35static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
     36
     37DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
     38		"A percentage of time allocated for copy on write");
     39
     40/*
     41 * The block size of the device holding pool data must be
     42 * between 64KB and 1GB.
     43 */
     44#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
     45#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
     46
     47/*
     48 * Device id is restricted to 24 bits.
     49 */
     50#define MAX_DEV_ID ((1 << 24) - 1)
     51
     52/*
     53 * How do we handle breaking sharing of data blocks?
     54 * =================================================
     55 *
     56 * We use a standard copy-on-write btree to store the mappings for the
     57 * devices (note I'm talking about copy-on-write of the metadata here, not
     58 * the data).  When you take an internal snapshot you clone the root node
     59 * of the origin btree.  After this there is no concept of an origin or a
     60 * snapshot.  They are just two device trees that happen to point to the
     61 * same data blocks.
     62 *
     63 * When we get a write in we decide if it's to a shared data block using
     64 * some timestamp magic.  If it is, we have to break sharing.
     65 *
     66 * Let's say we write to a shared block in what was the origin.  The
     67 * steps are:
     68 *
     69 * i) plug io further to this physical block. (see bio_prison code).
     70 *
     71 * ii) quiesce any read io to that shared data block.  Obviously
     72 * including all devices that share this block.  (see dm_deferred_set code)
     73 *
     74 * iii) copy the data block to a newly allocated block.  This step can be
     75 * missed out if the io covers the block. (schedule_copy).
     76 *
     77 * iv) insert the new mapping into the origin's btree
     78 * (process_prepared_mapping).  This act of inserting breaks some
     79 * sharing of btree nodes between the two devices.  Breaking sharing only
     80 * affects the btree of that specific device.  Btrees for the other
     81 * devices that share the block never change.  The btree for the origin
     82 * device as it was after the last commit is untouched, ie. we're using
     83 * persistent data structures in the functional programming sense.
     84 *
     85 * v) unplug io to this physical block, including the io that triggered
     86 * the breaking of sharing.
     87 *
     88 * Steps (ii) and (iii) occur in parallel.
     89 *
     90 * The metadata _doesn't_ need to be committed before the io continues.  We
     91 * get away with this because the io is always written to a _new_ block.
     92 * If there's a crash, then:
     93 *
     94 * - The origin mapping will point to the old origin block (the shared
     95 * one).  This will contain the data as it was before the io that triggered
     96 * the breaking of sharing came in.
     97 *
     98 * - The snap mapping still points to the old block.  As it would after
     99 * the commit.
    100 *
    101 * The downside of this scheme is the timestamp magic isn't perfect, and
    102 * will continue to think that data block in the snapshot device is shared
    103 * even after the write to the origin has broken sharing.  I suspect data
    104 * blocks will typically be shared by many different devices, so we're
    105 * breaking sharing n + 1 times, rather than n, where n is the number of
    106 * devices that reference this data block.  At the moment I think the
    107 * benefits far, far outweigh the disadvantages.
    108 */
    109
    110/*----------------------------------------------------------------*/
    111
    112/*
    113 * Key building.
    114 */
    115enum lock_space {
    116	VIRTUAL,
    117	PHYSICAL
    118};
    119
    120static void build_key(struct dm_thin_device *td, enum lock_space ls,
    121		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
    122{
    123	key->virtual = (ls == VIRTUAL);
    124	key->dev = dm_thin_dev_id(td);
    125	key->block_begin = b;
    126	key->block_end = e;
    127}
    128
    129static void build_data_key(struct dm_thin_device *td, dm_block_t b,
    130			   struct dm_cell_key *key)
    131{
    132	build_key(td, PHYSICAL, b, b + 1llu, key);
    133}
    134
    135static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
    136			      struct dm_cell_key *key)
    137{
    138	build_key(td, VIRTUAL, b, b + 1llu, key);
    139}
    140
    141/*----------------------------------------------------------------*/
    142
    143#define THROTTLE_THRESHOLD (1 * HZ)
    144
    145struct throttle {
    146	struct rw_semaphore lock;
    147	unsigned long threshold;
    148	bool throttle_applied;
    149};
    150
    151static void throttle_init(struct throttle *t)
    152{
    153	init_rwsem(&t->lock);
    154	t->throttle_applied = false;
    155}
    156
    157static void throttle_work_start(struct throttle *t)
    158{
    159	t->threshold = jiffies + THROTTLE_THRESHOLD;
    160}
    161
    162static void throttle_work_update(struct throttle *t)
    163{
    164	if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
    165		down_write(&t->lock);
    166		t->throttle_applied = true;
    167	}
    168}
    169
    170static void throttle_work_complete(struct throttle *t)
    171{
    172	if (t->throttle_applied) {
    173		t->throttle_applied = false;
    174		up_write(&t->lock);
    175	}
    176}
    177
    178static void throttle_lock(struct throttle *t)
    179{
    180	down_read(&t->lock);
    181}
    182
    183static void throttle_unlock(struct throttle *t)
    184{
    185	up_read(&t->lock);
    186}
    187
    188/*----------------------------------------------------------------*/
    189
    190/*
    191 * A pool device ties together a metadata device and a data device.  It
    192 * also provides the interface for creating and destroying internal
    193 * devices.
    194 */
    195struct dm_thin_new_mapping;
    196
    197/*
    198 * The pool runs in various modes.  Ordered in degraded order for comparisons.
    199 */
    200enum pool_mode {
    201	PM_WRITE,		/* metadata may be changed */
    202	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
    203
    204	/*
    205	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
    206	 */
    207	PM_OUT_OF_METADATA_SPACE,
    208	PM_READ_ONLY,		/* metadata may not be changed */
    209
    210	PM_FAIL,		/* all I/O fails */
    211};
    212
    213struct pool_features {
    214	enum pool_mode mode;
    215
    216	bool zero_new_blocks:1;
    217	bool discard_enabled:1;
    218	bool discard_passdown:1;
    219	bool error_if_no_space:1;
    220};
    221
    222struct thin_c;
    223typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
    224typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
    225typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
    226
    227#define CELL_SORT_ARRAY_SIZE 8192
    228
    229struct pool {
    230	struct list_head list;
    231	struct dm_target *ti;	/* Only set if a pool target is bound */
    232
    233	struct mapped_device *pool_md;
    234	struct block_device *data_dev;
    235	struct block_device *md_dev;
    236	struct dm_pool_metadata *pmd;
    237
    238	dm_block_t low_water_blocks;
    239	uint32_t sectors_per_block;
    240	int sectors_per_block_shift;
    241
    242	struct pool_features pf;
    243	bool low_water_triggered:1;	/* A dm event has been sent */
    244	bool suspended:1;
    245	bool out_of_data_space:1;
    246
    247	struct dm_bio_prison *prison;
    248	struct dm_kcopyd_client *copier;
    249
    250	struct work_struct worker;
    251	struct workqueue_struct *wq;
    252	struct throttle throttle;
    253	struct delayed_work waker;
    254	struct delayed_work no_space_timeout;
    255
    256	unsigned long last_commit_jiffies;
    257	unsigned ref_count;
    258
    259	spinlock_t lock;
    260	struct bio_list deferred_flush_bios;
    261	struct bio_list deferred_flush_completions;
    262	struct list_head prepared_mappings;
    263	struct list_head prepared_discards;
    264	struct list_head prepared_discards_pt2;
    265	struct list_head active_thins;
    266
    267	struct dm_deferred_set *shared_read_ds;
    268	struct dm_deferred_set *all_io_ds;
    269
    270	struct dm_thin_new_mapping *next_mapping;
    271
    272	process_bio_fn process_bio;
    273	process_bio_fn process_discard;
    274
    275	process_cell_fn process_cell;
    276	process_cell_fn process_discard_cell;
    277
    278	process_mapping_fn process_prepared_mapping;
    279	process_mapping_fn process_prepared_discard;
    280	process_mapping_fn process_prepared_discard_pt2;
    281
    282	struct dm_bio_prison_cell **cell_sort_array;
    283
    284	mempool_t mapping_pool;
    285};
    286
    287static void metadata_operation_failed(struct pool *pool, const char *op, int r);
    288
    289static enum pool_mode get_pool_mode(struct pool *pool)
    290{
    291	return pool->pf.mode;
    292}
    293
    294static void notify_of_pool_mode_change(struct pool *pool)
    295{
    296	const char *descs[] = {
    297		"write",
    298		"out-of-data-space",
    299		"read-only",
    300		"read-only",
    301		"fail"
    302	};
    303	const char *extra_desc = NULL;
    304	enum pool_mode mode = get_pool_mode(pool);
    305
    306	if (mode == PM_OUT_OF_DATA_SPACE) {
    307		if (!pool->pf.error_if_no_space)
    308			extra_desc = " (queue IO)";
    309		else
    310			extra_desc = " (error IO)";
    311	}
    312
    313	dm_table_event(pool->ti->table);
    314	DMINFO("%s: switching pool to %s%s mode",
    315	       dm_device_name(pool->pool_md),
    316	       descs[(int)mode], extra_desc ? : "");
    317}
    318
    319/*
    320 * Target context for a pool.
    321 */
    322struct pool_c {
    323	struct dm_target *ti;
    324	struct pool *pool;
    325	struct dm_dev *data_dev;
    326	struct dm_dev *metadata_dev;
    327
    328	dm_block_t low_water_blocks;
    329	struct pool_features requested_pf; /* Features requested during table load */
    330	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
    331};
    332
    333/*
    334 * Target context for a thin.
    335 */
    336struct thin_c {
    337	struct list_head list;
    338	struct dm_dev *pool_dev;
    339	struct dm_dev *origin_dev;
    340	sector_t origin_size;
    341	dm_thin_id dev_id;
    342
    343	struct pool *pool;
    344	struct dm_thin_device *td;
    345	struct mapped_device *thin_md;
    346
    347	bool requeue_mode:1;
    348	spinlock_t lock;
    349	struct list_head deferred_cells;
    350	struct bio_list deferred_bio_list;
    351	struct bio_list retry_on_resume_list;
    352	struct rb_root sort_bio_list; /* sorted list of deferred bios */
    353
    354	/*
    355	 * Ensures the thin is not destroyed until the worker has finished
    356	 * iterating the active_thins list.
    357	 */
    358	refcount_t refcount;
    359	struct completion can_destroy;
    360};
    361
    362/*----------------------------------------------------------------*/
    363
    364static bool block_size_is_power_of_two(struct pool *pool)
    365{
    366	return pool->sectors_per_block_shift >= 0;
    367}
    368
    369static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
    370{
    371	return block_size_is_power_of_two(pool) ?
    372		(b << pool->sectors_per_block_shift) :
    373		(b * pool->sectors_per_block);
    374}
    375
    376/*----------------------------------------------------------------*/
    377
    378struct discard_op {
    379	struct thin_c *tc;
    380	struct blk_plug plug;
    381	struct bio *parent_bio;
    382	struct bio *bio;
    383};
    384
    385static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
    386{
    387	BUG_ON(!parent);
    388
    389	op->tc = tc;
    390	blk_start_plug(&op->plug);
    391	op->parent_bio = parent;
    392	op->bio = NULL;
    393}
    394
    395static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
    396{
    397	struct thin_c *tc = op->tc;
    398	sector_t s = block_to_sectors(tc->pool, data_b);
    399	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
    400
    401	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT,
    402				      &op->bio);
    403}
    404
    405static void end_discard(struct discard_op *op, int r)
    406{
    407	if (op->bio) {
    408		/*
    409		 * Even if one of the calls to issue_discard failed, we
    410		 * need to wait for the chain to complete.
    411		 */
    412		bio_chain(op->bio, op->parent_bio);
    413		bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
    414		submit_bio(op->bio);
    415	}
    416
    417	blk_finish_plug(&op->plug);
    418
    419	/*
    420	 * Even if r is set, there could be sub discards in flight that we
    421	 * need to wait for.
    422	 */
    423	if (r && !op->parent_bio->bi_status)
    424		op->parent_bio->bi_status = errno_to_blk_status(r);
    425	bio_endio(op->parent_bio);
    426}
    427
    428/*----------------------------------------------------------------*/
    429
    430/*
    431 * wake_worker() is used when new work is queued and when pool_resume is
    432 * ready to continue deferred IO processing.
    433 */
    434static void wake_worker(struct pool *pool)
    435{
    436	queue_work(pool->wq, &pool->worker);
    437}
    438
    439/*----------------------------------------------------------------*/
    440
    441static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
    442		      struct dm_bio_prison_cell **cell_result)
    443{
    444	int r;
    445	struct dm_bio_prison_cell *cell_prealloc;
    446
    447	/*
    448	 * Allocate a cell from the prison's mempool.
    449	 * This might block but it can't fail.
    450	 */
    451	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
    452
    453	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
    454	if (r)
    455		/*
    456		 * We reused an old cell; we can get rid of
    457		 * the new one.
    458		 */
    459		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
    460
    461	return r;
    462}
    463
    464static void cell_release(struct pool *pool,
    465			 struct dm_bio_prison_cell *cell,
    466			 struct bio_list *bios)
    467{
    468	dm_cell_release(pool->prison, cell, bios);
    469	dm_bio_prison_free_cell(pool->prison, cell);
    470}
    471
    472static void cell_visit_release(struct pool *pool,
    473			       void (*fn)(void *, struct dm_bio_prison_cell *),
    474			       void *context,
    475			       struct dm_bio_prison_cell *cell)
    476{
    477	dm_cell_visit_release(pool->prison, fn, context, cell);
    478	dm_bio_prison_free_cell(pool->prison, cell);
    479}
    480
    481static void cell_release_no_holder(struct pool *pool,
    482				   struct dm_bio_prison_cell *cell,
    483				   struct bio_list *bios)
    484{
    485	dm_cell_release_no_holder(pool->prison, cell, bios);
    486	dm_bio_prison_free_cell(pool->prison, cell);
    487}
    488
    489static void cell_error_with_code(struct pool *pool,
    490		struct dm_bio_prison_cell *cell, blk_status_t error_code)
    491{
    492	dm_cell_error(pool->prison, cell, error_code);
    493	dm_bio_prison_free_cell(pool->prison, cell);
    494}
    495
    496static blk_status_t get_pool_io_error_code(struct pool *pool)
    497{
    498	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
    499}
    500
    501static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
    502{
    503	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
    504}
    505
    506static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
    507{
    508	cell_error_with_code(pool, cell, 0);
    509}
    510
    511static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
    512{
    513	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
    514}
    515
    516/*----------------------------------------------------------------*/
    517
    518/*
    519 * A global list of pools that uses a struct mapped_device as a key.
    520 */
    521static struct dm_thin_pool_table {
    522	struct mutex mutex;
    523	struct list_head pools;
    524} dm_thin_pool_table;
    525
    526static void pool_table_init(void)
    527{
    528	mutex_init(&dm_thin_pool_table.mutex);
    529	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
    530}
    531
    532static void pool_table_exit(void)
    533{
    534	mutex_destroy(&dm_thin_pool_table.mutex);
    535}
    536
    537static void __pool_table_insert(struct pool *pool)
    538{
    539	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
    540	list_add(&pool->list, &dm_thin_pool_table.pools);
    541}
    542
    543static void __pool_table_remove(struct pool *pool)
    544{
    545	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
    546	list_del(&pool->list);
    547}
    548
    549static struct pool *__pool_table_lookup(struct mapped_device *md)
    550{
    551	struct pool *pool = NULL, *tmp;
    552
    553	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
    554
    555	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
    556		if (tmp->pool_md == md) {
    557			pool = tmp;
    558			break;
    559		}
    560	}
    561
    562	return pool;
    563}
    564
    565static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
    566{
    567	struct pool *pool = NULL, *tmp;
    568
    569	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
    570
    571	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
    572		if (tmp->md_dev == md_dev) {
    573			pool = tmp;
    574			break;
    575		}
    576	}
    577
    578	return pool;
    579}
    580
    581/*----------------------------------------------------------------*/
    582
    583struct dm_thin_endio_hook {
    584	struct thin_c *tc;
    585	struct dm_deferred_entry *shared_read_entry;
    586	struct dm_deferred_entry *all_io_entry;
    587	struct dm_thin_new_mapping *overwrite_mapping;
    588	struct rb_node rb_node;
    589	struct dm_bio_prison_cell *cell;
    590};
    591
    592static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
    593{
    594	bio_list_merge(bios, master);
    595	bio_list_init(master);
    596}
    597
    598static void error_bio_list(struct bio_list *bios, blk_status_t error)
    599{
    600	struct bio *bio;
    601
    602	while ((bio = bio_list_pop(bios))) {
    603		bio->bi_status = error;
    604		bio_endio(bio);
    605	}
    606}
    607
    608static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
    609		blk_status_t error)
    610{
    611	struct bio_list bios;
    612
    613	bio_list_init(&bios);
    614
    615	spin_lock_irq(&tc->lock);
    616	__merge_bio_list(&bios, master);
    617	spin_unlock_irq(&tc->lock);
    618
    619	error_bio_list(&bios, error);
    620}
    621
    622static void requeue_deferred_cells(struct thin_c *tc)
    623{
    624	struct pool *pool = tc->pool;
    625	struct list_head cells;
    626	struct dm_bio_prison_cell *cell, *tmp;
    627
    628	INIT_LIST_HEAD(&cells);
    629
    630	spin_lock_irq(&tc->lock);
    631	list_splice_init(&tc->deferred_cells, &cells);
    632	spin_unlock_irq(&tc->lock);
    633
    634	list_for_each_entry_safe(cell, tmp, &cells, user_list)
    635		cell_requeue(pool, cell);
    636}
    637
    638static void requeue_io(struct thin_c *tc)
    639{
    640	struct bio_list bios;
    641
    642	bio_list_init(&bios);
    643
    644	spin_lock_irq(&tc->lock);
    645	__merge_bio_list(&bios, &tc->deferred_bio_list);
    646	__merge_bio_list(&bios, &tc->retry_on_resume_list);
    647	spin_unlock_irq(&tc->lock);
    648
    649	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
    650	requeue_deferred_cells(tc);
    651}
    652
    653static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
    654{
    655	struct thin_c *tc;
    656
    657	rcu_read_lock();
    658	list_for_each_entry_rcu(tc, &pool->active_thins, list)
    659		error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
    660	rcu_read_unlock();
    661}
    662
    663static void error_retry_list(struct pool *pool)
    664{
    665	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
    666}
    667
    668/*
    669 * This section of code contains the logic for processing a thin device's IO.
    670 * Much of the code depends on pool object resources (lists, workqueues, etc)
    671 * but most is exclusively called from the thin target rather than the thin-pool
    672 * target.
    673 */
    674
    675static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
    676{
    677	struct pool *pool = tc->pool;
    678	sector_t block_nr = bio->bi_iter.bi_sector;
    679
    680	if (block_size_is_power_of_two(pool))
    681		block_nr >>= pool->sectors_per_block_shift;
    682	else
    683		(void) sector_div(block_nr, pool->sectors_per_block);
    684
    685	return block_nr;
    686}
    687
    688/*
    689 * Returns the _complete_ blocks that this bio covers.
    690 */
    691static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
    692				dm_block_t *begin, dm_block_t *end)
    693{
    694	struct pool *pool = tc->pool;
    695	sector_t b = bio->bi_iter.bi_sector;
    696	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
    697
    698	b += pool->sectors_per_block - 1ull; /* so we round up */
    699
    700	if (block_size_is_power_of_two(pool)) {
    701		b >>= pool->sectors_per_block_shift;
    702		e >>= pool->sectors_per_block_shift;
    703	} else {
    704		(void) sector_div(b, pool->sectors_per_block);
    705		(void) sector_div(e, pool->sectors_per_block);
    706	}
    707
    708	if (e < b)
    709		/* Can happen if the bio is within a single block. */
    710		e = b;
    711
    712	*begin = b;
    713	*end = e;
    714}
    715
    716static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
    717{
    718	struct pool *pool = tc->pool;
    719	sector_t bi_sector = bio->bi_iter.bi_sector;
    720
    721	bio_set_dev(bio, tc->pool_dev->bdev);
    722	if (block_size_is_power_of_two(pool))
    723		bio->bi_iter.bi_sector =
    724			(block << pool->sectors_per_block_shift) |
    725			(bi_sector & (pool->sectors_per_block - 1));
    726	else
    727		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
    728				 sector_div(bi_sector, pool->sectors_per_block);
    729}
    730
    731static void remap_to_origin(struct thin_c *tc, struct bio *bio)
    732{
    733	bio_set_dev(bio, tc->origin_dev->bdev);
    734}
    735
    736static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
    737{
    738	return op_is_flush(bio->bi_opf) &&
    739		dm_thin_changed_this_transaction(tc->td);
    740}
    741
    742static void inc_all_io_entry(struct pool *pool, struct bio *bio)
    743{
    744	struct dm_thin_endio_hook *h;
    745
    746	if (bio_op(bio) == REQ_OP_DISCARD)
    747		return;
    748
    749	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
    750	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
    751}
    752
    753static void issue(struct thin_c *tc, struct bio *bio)
    754{
    755	struct pool *pool = tc->pool;
    756
    757	if (!bio_triggers_commit(tc, bio)) {
    758		dm_submit_bio_remap(bio, NULL);
    759		return;
    760	}
    761
    762	/*
    763	 * Complete bio with an error if earlier I/O caused changes to
    764	 * the metadata that can't be committed e.g, due to I/O errors
    765	 * on the metadata device.
    766	 */
    767	if (dm_thin_aborted_changes(tc->td)) {
    768		bio_io_error(bio);
    769		return;
    770	}
    771
    772	/*
    773	 * Batch together any bios that trigger commits and then issue a
    774	 * single commit for them in process_deferred_bios().
    775	 */
    776	spin_lock_irq(&pool->lock);
    777	bio_list_add(&pool->deferred_flush_bios, bio);
    778	spin_unlock_irq(&pool->lock);
    779}
    780
    781static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
    782{
    783	remap_to_origin(tc, bio);
    784	issue(tc, bio);
    785}
    786
    787static void remap_and_issue(struct thin_c *tc, struct bio *bio,
    788			    dm_block_t block)
    789{
    790	remap(tc, bio, block);
    791	issue(tc, bio);
    792}
    793
    794/*----------------------------------------------------------------*/
    795
    796/*
    797 * Bio endio functions.
    798 */
    799struct dm_thin_new_mapping {
    800	struct list_head list;
    801
    802	bool pass_discard:1;
    803	bool maybe_shared:1;
    804
    805	/*
    806	 * Track quiescing, copying and zeroing preparation actions.  When this
    807	 * counter hits zero the block is prepared and can be inserted into the
    808	 * btree.
    809	 */
    810	atomic_t prepare_actions;
    811
    812	blk_status_t status;
    813	struct thin_c *tc;
    814	dm_block_t virt_begin, virt_end;
    815	dm_block_t data_block;
    816	struct dm_bio_prison_cell *cell;
    817
    818	/*
    819	 * If the bio covers the whole area of a block then we can avoid
    820	 * zeroing or copying.  Instead this bio is hooked.  The bio will
    821	 * still be in the cell, so care has to be taken to avoid issuing
    822	 * the bio twice.
    823	 */
    824	struct bio *bio;
    825	bio_end_io_t *saved_bi_end_io;
    826};
    827
    828static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
    829{
    830	struct pool *pool = m->tc->pool;
    831
    832	if (atomic_dec_and_test(&m->prepare_actions)) {
    833		list_add_tail(&m->list, &pool->prepared_mappings);
    834		wake_worker(pool);
    835	}
    836}
    837
    838static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
    839{
    840	unsigned long flags;
    841	struct pool *pool = m->tc->pool;
    842
    843	spin_lock_irqsave(&pool->lock, flags);
    844	__complete_mapping_preparation(m);
    845	spin_unlock_irqrestore(&pool->lock, flags);
    846}
    847
    848static void copy_complete(int read_err, unsigned long write_err, void *context)
    849{
    850	struct dm_thin_new_mapping *m = context;
    851
    852	m->status = read_err || write_err ? BLK_STS_IOERR : 0;
    853	complete_mapping_preparation(m);
    854}
    855
    856static void overwrite_endio(struct bio *bio)
    857{
    858	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
    859	struct dm_thin_new_mapping *m = h->overwrite_mapping;
    860
    861	bio->bi_end_io = m->saved_bi_end_io;
    862
    863	m->status = bio->bi_status;
    864	complete_mapping_preparation(m);
    865}
    866
    867/*----------------------------------------------------------------*/
    868
    869/*
    870 * Workqueue.
    871 */
    872
    873/*
    874 * Prepared mapping jobs.
    875 */
    876
    877/*
    878 * This sends the bios in the cell, except the original holder, back
    879 * to the deferred_bios list.
    880 */
    881static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
    882{
    883	struct pool *pool = tc->pool;
    884	unsigned long flags;
    885	int has_work;
    886
    887	spin_lock_irqsave(&tc->lock, flags);
    888	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
    889	has_work = !bio_list_empty(&tc->deferred_bio_list);
    890	spin_unlock_irqrestore(&tc->lock, flags);
    891
    892	if (has_work)
    893		wake_worker(pool);
    894}
    895
    896static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
    897
    898struct remap_info {
    899	struct thin_c *tc;
    900	struct bio_list defer_bios;
    901	struct bio_list issue_bios;
    902};
    903
    904static void __inc_remap_and_issue_cell(void *context,
    905				       struct dm_bio_prison_cell *cell)
    906{
    907	struct remap_info *info = context;
    908	struct bio *bio;
    909
    910	while ((bio = bio_list_pop(&cell->bios))) {
    911		if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
    912			bio_list_add(&info->defer_bios, bio);
    913		else {
    914			inc_all_io_entry(info->tc->pool, bio);
    915
    916			/*
    917			 * We can't issue the bios with the bio prison lock
    918			 * held, so we add them to a list to issue on
    919			 * return from this function.
    920			 */
    921			bio_list_add(&info->issue_bios, bio);
    922		}
    923	}
    924}
    925
    926static void inc_remap_and_issue_cell(struct thin_c *tc,
    927				     struct dm_bio_prison_cell *cell,
    928				     dm_block_t block)
    929{
    930	struct bio *bio;
    931	struct remap_info info;
    932
    933	info.tc = tc;
    934	bio_list_init(&info.defer_bios);
    935	bio_list_init(&info.issue_bios);
    936
    937	/*
    938	 * We have to be careful to inc any bios we're about to issue
    939	 * before the cell is released, and avoid a race with new bios
    940	 * being added to the cell.
    941	 */
    942	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
    943			   &info, cell);
    944
    945	while ((bio = bio_list_pop(&info.defer_bios)))
    946		thin_defer_bio(tc, bio);
    947
    948	while ((bio = bio_list_pop(&info.issue_bios)))
    949		remap_and_issue(info.tc, bio, block);
    950}
    951
    952static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
    953{
    954	cell_error(m->tc->pool, m->cell);
    955	list_del(&m->list);
    956	mempool_free(m, &m->tc->pool->mapping_pool);
    957}
    958
    959static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
    960{
    961	struct pool *pool = tc->pool;
    962
    963	/*
    964	 * If the bio has the REQ_FUA flag set we must commit the metadata
    965	 * before signaling its completion.
    966	 */
    967	if (!bio_triggers_commit(tc, bio)) {
    968		bio_endio(bio);
    969		return;
    970	}
    971
    972	/*
    973	 * Complete bio with an error if earlier I/O caused changes to the
    974	 * metadata that can't be committed, e.g, due to I/O errors on the
    975	 * metadata device.
    976	 */
    977	if (dm_thin_aborted_changes(tc->td)) {
    978		bio_io_error(bio);
    979		return;
    980	}
    981
    982	/*
    983	 * Batch together any bios that trigger commits and then issue a
    984	 * single commit for them in process_deferred_bios().
    985	 */
    986	spin_lock_irq(&pool->lock);
    987	bio_list_add(&pool->deferred_flush_completions, bio);
    988	spin_unlock_irq(&pool->lock);
    989}
    990
    991static void process_prepared_mapping(struct dm_thin_new_mapping *m)
    992{
    993	struct thin_c *tc = m->tc;
    994	struct pool *pool = tc->pool;
    995	struct bio *bio = m->bio;
    996	int r;
    997
    998	if (m->status) {
    999		cell_error(pool, m->cell);
   1000		goto out;
   1001	}
   1002
   1003	/*
   1004	 * Commit the prepared block into the mapping btree.
   1005	 * Any I/O for this block arriving after this point will get
   1006	 * remapped to it directly.
   1007	 */
   1008	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
   1009	if (r) {
   1010		metadata_operation_failed(pool, "dm_thin_insert_block", r);
   1011		cell_error(pool, m->cell);
   1012		goto out;
   1013	}
   1014
   1015	/*
   1016	 * Release any bios held while the block was being provisioned.
   1017	 * If we are processing a write bio that completely covers the block,
   1018	 * we already processed it so can ignore it now when processing
   1019	 * the bios in the cell.
   1020	 */
   1021	if (bio) {
   1022		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
   1023		complete_overwrite_bio(tc, bio);
   1024	} else {
   1025		inc_all_io_entry(tc->pool, m->cell->holder);
   1026		remap_and_issue(tc, m->cell->holder, m->data_block);
   1027		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
   1028	}
   1029
   1030out:
   1031	list_del(&m->list);
   1032	mempool_free(m, &pool->mapping_pool);
   1033}
   1034
   1035/*----------------------------------------------------------------*/
   1036
   1037static void free_discard_mapping(struct dm_thin_new_mapping *m)
   1038{
   1039	struct thin_c *tc = m->tc;
   1040	if (m->cell)
   1041		cell_defer_no_holder(tc, m->cell);
   1042	mempool_free(m, &tc->pool->mapping_pool);
   1043}
   1044
   1045static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
   1046{
   1047	bio_io_error(m->bio);
   1048	free_discard_mapping(m);
   1049}
   1050
   1051static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
   1052{
   1053	bio_endio(m->bio);
   1054	free_discard_mapping(m);
   1055}
   1056
   1057static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
   1058{
   1059	int r;
   1060	struct thin_c *tc = m->tc;
   1061
   1062	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
   1063	if (r) {
   1064		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
   1065		bio_io_error(m->bio);
   1066	} else
   1067		bio_endio(m->bio);
   1068
   1069	cell_defer_no_holder(tc, m->cell);
   1070	mempool_free(m, &tc->pool->mapping_pool);
   1071}
   1072
   1073/*----------------------------------------------------------------*/
   1074
   1075static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
   1076						   struct bio *discard_parent)
   1077{
   1078	/*
   1079	 * We've already unmapped this range of blocks, but before we
   1080	 * passdown we have to check that these blocks are now unused.
   1081	 */
   1082	int r = 0;
   1083	bool shared = true;
   1084	struct thin_c *tc = m->tc;
   1085	struct pool *pool = tc->pool;
   1086	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
   1087	struct discard_op op;
   1088
   1089	begin_discard(&op, tc, discard_parent);
   1090	while (b != end) {
   1091		/* find start of unmapped run */
   1092		for (; b < end; b++) {
   1093			r = dm_pool_block_is_shared(pool->pmd, b, &shared);
   1094			if (r)
   1095				goto out;
   1096
   1097			if (!shared)
   1098				break;
   1099		}
   1100
   1101		if (b == end)
   1102			break;
   1103
   1104		/* find end of run */
   1105		for (e = b + 1; e != end; e++) {
   1106			r = dm_pool_block_is_shared(pool->pmd, e, &shared);
   1107			if (r)
   1108				goto out;
   1109
   1110			if (shared)
   1111				break;
   1112		}
   1113
   1114		r = issue_discard(&op, b, e);
   1115		if (r)
   1116			goto out;
   1117
   1118		b = e;
   1119	}
   1120out:
   1121	end_discard(&op, r);
   1122}
   1123
   1124static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
   1125{
   1126	unsigned long flags;
   1127	struct pool *pool = m->tc->pool;
   1128
   1129	spin_lock_irqsave(&pool->lock, flags);
   1130	list_add_tail(&m->list, &pool->prepared_discards_pt2);
   1131	spin_unlock_irqrestore(&pool->lock, flags);
   1132	wake_worker(pool);
   1133}
   1134
   1135static void passdown_endio(struct bio *bio)
   1136{
   1137	/*
   1138	 * It doesn't matter if the passdown discard failed, we still want
   1139	 * to unmap (we ignore err).
   1140	 */
   1141	queue_passdown_pt2(bio->bi_private);
   1142	bio_put(bio);
   1143}
   1144
   1145static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
   1146{
   1147	int r;
   1148	struct thin_c *tc = m->tc;
   1149	struct pool *pool = tc->pool;
   1150	struct bio *discard_parent;
   1151	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
   1152
   1153	/*
   1154	 * Only this thread allocates blocks, so we can be sure that the
   1155	 * newly unmapped blocks will not be allocated before the end of
   1156	 * the function.
   1157	 */
   1158	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
   1159	if (r) {
   1160		metadata_operation_failed(pool, "dm_thin_remove_range", r);
   1161		bio_io_error(m->bio);
   1162		cell_defer_no_holder(tc, m->cell);
   1163		mempool_free(m, &pool->mapping_pool);
   1164		return;
   1165	}
   1166
   1167	/*
   1168	 * Increment the unmapped blocks.  This prevents a race between the
   1169	 * passdown io and reallocation of freed blocks.
   1170	 */
   1171	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
   1172	if (r) {
   1173		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
   1174		bio_io_error(m->bio);
   1175		cell_defer_no_holder(tc, m->cell);
   1176		mempool_free(m, &pool->mapping_pool);
   1177		return;
   1178	}
   1179
   1180	discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
   1181	discard_parent->bi_end_io = passdown_endio;
   1182	discard_parent->bi_private = m;
   1183 	if (m->maybe_shared)
   1184 		passdown_double_checking_shared_status(m, discard_parent);
   1185 	else {
   1186		struct discard_op op;
   1187
   1188		begin_discard(&op, tc, discard_parent);
   1189		r = issue_discard(&op, m->data_block, data_end);
   1190		end_discard(&op, r);
   1191	}
   1192}
   1193
   1194static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
   1195{
   1196	int r;
   1197	struct thin_c *tc = m->tc;
   1198	struct pool *pool = tc->pool;
   1199
   1200	/*
   1201	 * The passdown has completed, so now we can decrement all those
   1202	 * unmapped blocks.
   1203	 */
   1204	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
   1205				   m->data_block + (m->virt_end - m->virt_begin));
   1206	if (r) {
   1207		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
   1208		bio_io_error(m->bio);
   1209	} else
   1210		bio_endio(m->bio);
   1211
   1212	cell_defer_no_holder(tc, m->cell);
   1213	mempool_free(m, &pool->mapping_pool);
   1214}
   1215
   1216static void process_prepared(struct pool *pool, struct list_head *head,
   1217			     process_mapping_fn *fn)
   1218{
   1219	struct list_head maps;
   1220	struct dm_thin_new_mapping *m, *tmp;
   1221
   1222	INIT_LIST_HEAD(&maps);
   1223	spin_lock_irq(&pool->lock);
   1224	list_splice_init(head, &maps);
   1225	spin_unlock_irq(&pool->lock);
   1226
   1227	list_for_each_entry_safe(m, tmp, &maps, list)
   1228		(*fn)(m);
   1229}
   1230
   1231/*
   1232 * Deferred bio jobs.
   1233 */
   1234static int io_overlaps_block(struct pool *pool, struct bio *bio)
   1235{
   1236	return bio->bi_iter.bi_size ==
   1237		(pool->sectors_per_block << SECTOR_SHIFT);
   1238}
   1239
   1240static int io_overwrites_block(struct pool *pool, struct bio *bio)
   1241{
   1242	return (bio_data_dir(bio) == WRITE) &&
   1243		io_overlaps_block(pool, bio);
   1244}
   1245
   1246static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
   1247			       bio_end_io_t *fn)
   1248{
   1249	*save = bio->bi_end_io;
   1250	bio->bi_end_io = fn;
   1251}
   1252
   1253static int ensure_next_mapping(struct pool *pool)
   1254{
   1255	if (pool->next_mapping)
   1256		return 0;
   1257
   1258	pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
   1259
   1260	return pool->next_mapping ? 0 : -ENOMEM;
   1261}
   1262
   1263static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
   1264{
   1265	struct dm_thin_new_mapping *m = pool->next_mapping;
   1266
   1267	BUG_ON(!pool->next_mapping);
   1268
   1269	memset(m, 0, sizeof(struct dm_thin_new_mapping));
   1270	INIT_LIST_HEAD(&m->list);
   1271	m->bio = NULL;
   1272
   1273	pool->next_mapping = NULL;
   1274
   1275	return m;
   1276}
   1277
   1278static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
   1279		    sector_t begin, sector_t end)
   1280{
   1281	struct dm_io_region to;
   1282
   1283	to.bdev = tc->pool_dev->bdev;
   1284	to.sector = begin;
   1285	to.count = end - begin;
   1286
   1287	dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
   1288}
   1289
   1290static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
   1291				      dm_block_t data_begin,
   1292				      struct dm_thin_new_mapping *m)
   1293{
   1294	struct pool *pool = tc->pool;
   1295	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   1296
   1297	h->overwrite_mapping = m;
   1298	m->bio = bio;
   1299	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
   1300	inc_all_io_entry(pool, bio);
   1301	remap_and_issue(tc, bio, data_begin);
   1302}
   1303
   1304/*
   1305 * A partial copy also needs to zero the uncopied region.
   1306 */
   1307static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
   1308			  struct dm_dev *origin, dm_block_t data_origin,
   1309			  dm_block_t data_dest,
   1310			  struct dm_bio_prison_cell *cell, struct bio *bio,
   1311			  sector_t len)
   1312{
   1313	struct pool *pool = tc->pool;
   1314	struct dm_thin_new_mapping *m = get_next_mapping(pool);
   1315
   1316	m->tc = tc;
   1317	m->virt_begin = virt_block;
   1318	m->virt_end = virt_block + 1u;
   1319	m->data_block = data_dest;
   1320	m->cell = cell;
   1321
   1322	/*
   1323	 * quiesce action + copy action + an extra reference held for the
   1324	 * duration of this function (we may need to inc later for a
   1325	 * partial zero).
   1326	 */
   1327	atomic_set(&m->prepare_actions, 3);
   1328
   1329	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
   1330		complete_mapping_preparation(m); /* already quiesced */
   1331
   1332	/*
   1333	 * IO to pool_dev remaps to the pool target's data_dev.
   1334	 *
   1335	 * If the whole block of data is being overwritten, we can issue the
   1336	 * bio immediately. Otherwise we use kcopyd to clone the data first.
   1337	 */
   1338	if (io_overwrites_block(pool, bio))
   1339		remap_and_issue_overwrite(tc, bio, data_dest, m);
   1340	else {
   1341		struct dm_io_region from, to;
   1342
   1343		from.bdev = origin->bdev;
   1344		from.sector = data_origin * pool->sectors_per_block;
   1345		from.count = len;
   1346
   1347		to.bdev = tc->pool_dev->bdev;
   1348		to.sector = data_dest * pool->sectors_per_block;
   1349		to.count = len;
   1350
   1351		dm_kcopyd_copy(pool->copier, &from, 1, &to,
   1352			       0, copy_complete, m);
   1353
   1354		/*
   1355		 * Do we need to zero a tail region?
   1356		 */
   1357		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
   1358			atomic_inc(&m->prepare_actions);
   1359			ll_zero(tc, m,
   1360				data_dest * pool->sectors_per_block + len,
   1361				(data_dest + 1) * pool->sectors_per_block);
   1362		}
   1363	}
   1364
   1365	complete_mapping_preparation(m); /* drop our ref */
   1366}
   1367
   1368static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
   1369				   dm_block_t data_origin, dm_block_t data_dest,
   1370				   struct dm_bio_prison_cell *cell, struct bio *bio)
   1371{
   1372	schedule_copy(tc, virt_block, tc->pool_dev,
   1373		      data_origin, data_dest, cell, bio,
   1374		      tc->pool->sectors_per_block);
   1375}
   1376
   1377static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
   1378			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
   1379			  struct bio *bio)
   1380{
   1381	struct pool *pool = tc->pool;
   1382	struct dm_thin_new_mapping *m = get_next_mapping(pool);
   1383
   1384	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
   1385	m->tc = tc;
   1386	m->virt_begin = virt_block;
   1387	m->virt_end = virt_block + 1u;
   1388	m->data_block = data_block;
   1389	m->cell = cell;
   1390
   1391	/*
   1392	 * If the whole block of data is being overwritten or we are not
   1393	 * zeroing pre-existing data, we can issue the bio immediately.
   1394	 * Otherwise we use kcopyd to zero the data first.
   1395	 */
   1396	if (pool->pf.zero_new_blocks) {
   1397		if (io_overwrites_block(pool, bio))
   1398			remap_and_issue_overwrite(tc, bio, data_block, m);
   1399		else
   1400			ll_zero(tc, m, data_block * pool->sectors_per_block,
   1401				(data_block + 1) * pool->sectors_per_block);
   1402	} else
   1403		process_prepared_mapping(m);
   1404}
   1405
   1406static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
   1407				   dm_block_t data_dest,
   1408				   struct dm_bio_prison_cell *cell, struct bio *bio)
   1409{
   1410	struct pool *pool = tc->pool;
   1411	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
   1412	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
   1413
   1414	if (virt_block_end <= tc->origin_size)
   1415		schedule_copy(tc, virt_block, tc->origin_dev,
   1416			      virt_block, data_dest, cell, bio,
   1417			      pool->sectors_per_block);
   1418
   1419	else if (virt_block_begin < tc->origin_size)
   1420		schedule_copy(tc, virt_block, tc->origin_dev,
   1421			      virt_block, data_dest, cell, bio,
   1422			      tc->origin_size - virt_block_begin);
   1423
   1424	else
   1425		schedule_zero(tc, virt_block, data_dest, cell, bio);
   1426}
   1427
   1428static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
   1429
   1430static void requeue_bios(struct pool *pool);
   1431
   1432static bool is_read_only_pool_mode(enum pool_mode mode)
   1433{
   1434	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
   1435}
   1436
   1437static bool is_read_only(struct pool *pool)
   1438{
   1439	return is_read_only_pool_mode(get_pool_mode(pool));
   1440}
   1441
   1442static void check_for_metadata_space(struct pool *pool)
   1443{
   1444	int r;
   1445	const char *ooms_reason = NULL;
   1446	dm_block_t nr_free;
   1447
   1448	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
   1449	if (r)
   1450		ooms_reason = "Could not get free metadata blocks";
   1451	else if (!nr_free)
   1452		ooms_reason = "No free metadata blocks";
   1453
   1454	if (ooms_reason && !is_read_only(pool)) {
   1455		DMERR("%s", ooms_reason);
   1456		set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
   1457	}
   1458}
   1459
   1460static void check_for_data_space(struct pool *pool)
   1461{
   1462	int r;
   1463	dm_block_t nr_free;
   1464
   1465	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
   1466		return;
   1467
   1468	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
   1469	if (r)
   1470		return;
   1471
   1472	if (nr_free) {
   1473		set_pool_mode(pool, PM_WRITE);
   1474		requeue_bios(pool);
   1475	}
   1476}
   1477
   1478/*
   1479 * A non-zero return indicates read_only or fail_io mode.
   1480 * Many callers don't care about the return value.
   1481 */
   1482static int commit(struct pool *pool)
   1483{
   1484	int r;
   1485
   1486	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
   1487		return -EINVAL;
   1488
   1489	r = dm_pool_commit_metadata(pool->pmd);
   1490	if (r)
   1491		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
   1492	else {
   1493		check_for_metadata_space(pool);
   1494		check_for_data_space(pool);
   1495	}
   1496
   1497	return r;
   1498}
   1499
   1500static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
   1501{
   1502	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
   1503		DMWARN("%s: reached low water mark for data device: sending event.",
   1504		       dm_device_name(pool->pool_md));
   1505		spin_lock_irq(&pool->lock);
   1506		pool->low_water_triggered = true;
   1507		spin_unlock_irq(&pool->lock);
   1508		dm_table_event(pool->ti->table);
   1509	}
   1510}
   1511
   1512static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
   1513{
   1514	int r;
   1515	dm_block_t free_blocks;
   1516	struct pool *pool = tc->pool;
   1517
   1518	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
   1519		return -EINVAL;
   1520
   1521	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
   1522	if (r) {
   1523		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
   1524		return r;
   1525	}
   1526
   1527	check_low_water_mark(pool, free_blocks);
   1528
   1529	if (!free_blocks) {
   1530		/*
   1531		 * Try to commit to see if that will free up some
   1532		 * more space.
   1533		 */
   1534		r = commit(pool);
   1535		if (r)
   1536			return r;
   1537
   1538		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
   1539		if (r) {
   1540			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
   1541			return r;
   1542		}
   1543
   1544		if (!free_blocks) {
   1545			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
   1546			return -ENOSPC;
   1547		}
   1548	}
   1549
   1550	r = dm_pool_alloc_data_block(pool->pmd, result);
   1551	if (r) {
   1552		if (r == -ENOSPC)
   1553			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
   1554		else
   1555			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
   1556		return r;
   1557	}
   1558
   1559	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
   1560	if (r) {
   1561		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
   1562		return r;
   1563	}
   1564
   1565	if (!free_blocks) {
   1566		/* Let's commit before we use up the metadata reserve. */
   1567		r = commit(pool);
   1568		if (r)
   1569			return r;
   1570	}
   1571
   1572	return 0;
   1573}
   1574
   1575/*
   1576 * If we have run out of space, queue bios until the device is
   1577 * resumed, presumably after having been reloaded with more space.
   1578 */
   1579static void retry_on_resume(struct bio *bio)
   1580{
   1581	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   1582	struct thin_c *tc = h->tc;
   1583
   1584	spin_lock_irq(&tc->lock);
   1585	bio_list_add(&tc->retry_on_resume_list, bio);
   1586	spin_unlock_irq(&tc->lock);
   1587}
   1588
   1589static blk_status_t should_error_unserviceable_bio(struct pool *pool)
   1590{
   1591	enum pool_mode m = get_pool_mode(pool);
   1592
   1593	switch (m) {
   1594	case PM_WRITE:
   1595		/* Shouldn't get here */
   1596		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
   1597		return BLK_STS_IOERR;
   1598
   1599	case PM_OUT_OF_DATA_SPACE:
   1600		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
   1601
   1602	case PM_OUT_OF_METADATA_SPACE:
   1603	case PM_READ_ONLY:
   1604	case PM_FAIL:
   1605		return BLK_STS_IOERR;
   1606	default:
   1607		/* Shouldn't get here */
   1608		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
   1609		return BLK_STS_IOERR;
   1610	}
   1611}
   1612
   1613static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
   1614{
   1615	blk_status_t error = should_error_unserviceable_bio(pool);
   1616
   1617	if (error) {
   1618		bio->bi_status = error;
   1619		bio_endio(bio);
   1620	} else
   1621		retry_on_resume(bio);
   1622}
   1623
   1624static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
   1625{
   1626	struct bio *bio;
   1627	struct bio_list bios;
   1628	blk_status_t error;
   1629
   1630	error = should_error_unserviceable_bio(pool);
   1631	if (error) {
   1632		cell_error_with_code(pool, cell, error);
   1633		return;
   1634	}
   1635
   1636	bio_list_init(&bios);
   1637	cell_release(pool, cell, &bios);
   1638
   1639	while ((bio = bio_list_pop(&bios)))
   1640		retry_on_resume(bio);
   1641}
   1642
   1643static void process_discard_cell_no_passdown(struct thin_c *tc,
   1644					     struct dm_bio_prison_cell *virt_cell)
   1645{
   1646	struct pool *pool = tc->pool;
   1647	struct dm_thin_new_mapping *m = get_next_mapping(pool);
   1648
   1649	/*
   1650	 * We don't need to lock the data blocks, since there's no
   1651	 * passdown.  We only lock data blocks for allocation and breaking sharing.
   1652	 */
   1653	m->tc = tc;
   1654	m->virt_begin = virt_cell->key.block_begin;
   1655	m->virt_end = virt_cell->key.block_end;
   1656	m->cell = virt_cell;
   1657	m->bio = virt_cell->holder;
   1658
   1659	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
   1660		pool->process_prepared_discard(m);
   1661}
   1662
   1663static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
   1664				 struct bio *bio)
   1665{
   1666	struct pool *pool = tc->pool;
   1667
   1668	int r;
   1669	bool maybe_shared;
   1670	struct dm_cell_key data_key;
   1671	struct dm_bio_prison_cell *data_cell;
   1672	struct dm_thin_new_mapping *m;
   1673	dm_block_t virt_begin, virt_end, data_begin;
   1674
   1675	while (begin != end) {
   1676		r = ensure_next_mapping(pool);
   1677		if (r)
   1678			/* we did our best */
   1679			return;
   1680
   1681		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
   1682					      &data_begin, &maybe_shared);
   1683		if (r)
   1684			/*
   1685			 * Silently fail, letting any mappings we've
   1686			 * created complete.
   1687			 */
   1688			break;
   1689
   1690		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
   1691		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
   1692			/* contention, we'll give up with this range */
   1693			begin = virt_end;
   1694			continue;
   1695		}
   1696
   1697		/*
   1698		 * IO may still be going to the destination block.  We must
   1699		 * quiesce before we can do the removal.
   1700		 */
   1701		m = get_next_mapping(pool);
   1702		m->tc = tc;
   1703		m->maybe_shared = maybe_shared;
   1704		m->virt_begin = virt_begin;
   1705		m->virt_end = virt_end;
   1706		m->data_block = data_begin;
   1707		m->cell = data_cell;
   1708		m->bio = bio;
   1709
   1710		/*
   1711		 * The parent bio must not complete before sub discard bios are
   1712		 * chained to it (see end_discard's bio_chain)!
   1713		 *
   1714		 * This per-mapping bi_remaining increment is paired with
   1715		 * the implicit decrement that occurs via bio_endio() in
   1716		 * end_discard().
   1717		 */
   1718		bio_inc_remaining(bio);
   1719		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
   1720			pool->process_prepared_discard(m);
   1721
   1722		begin = virt_end;
   1723	}
   1724}
   1725
   1726static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
   1727{
   1728	struct bio *bio = virt_cell->holder;
   1729	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   1730
   1731	/*
   1732	 * The virt_cell will only get freed once the origin bio completes.
   1733	 * This means it will remain locked while all the individual
   1734	 * passdown bios are in flight.
   1735	 */
   1736	h->cell = virt_cell;
   1737	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
   1738
   1739	/*
   1740	 * We complete the bio now, knowing that the bi_remaining field
   1741	 * will prevent completion until the sub range discards have
   1742	 * completed.
   1743	 */
   1744	bio_endio(bio);
   1745}
   1746
   1747static void process_discard_bio(struct thin_c *tc, struct bio *bio)
   1748{
   1749	dm_block_t begin, end;
   1750	struct dm_cell_key virt_key;
   1751	struct dm_bio_prison_cell *virt_cell;
   1752
   1753	get_bio_block_range(tc, bio, &begin, &end);
   1754	if (begin == end) {
   1755		/*
   1756		 * The discard covers less than a block.
   1757		 */
   1758		bio_endio(bio);
   1759		return;
   1760	}
   1761
   1762	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
   1763	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
   1764		/*
   1765		 * Potential starvation issue: We're relying on the
   1766		 * fs/application being well behaved, and not trying to
   1767		 * send IO to a region at the same time as discarding it.
   1768		 * If they do this persistently then it's possible this
   1769		 * cell will never be granted.
   1770		 */
   1771		return;
   1772
   1773	tc->pool->process_discard_cell(tc, virt_cell);
   1774}
   1775
   1776static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
   1777			  struct dm_cell_key *key,
   1778			  struct dm_thin_lookup_result *lookup_result,
   1779			  struct dm_bio_prison_cell *cell)
   1780{
   1781	int r;
   1782	dm_block_t data_block;
   1783	struct pool *pool = tc->pool;
   1784
   1785	r = alloc_data_block(tc, &data_block);
   1786	switch (r) {
   1787	case 0:
   1788		schedule_internal_copy(tc, block, lookup_result->block,
   1789				       data_block, cell, bio);
   1790		break;
   1791
   1792	case -ENOSPC:
   1793		retry_bios_on_resume(pool, cell);
   1794		break;
   1795
   1796	default:
   1797		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
   1798			    __func__, r);
   1799		cell_error(pool, cell);
   1800		break;
   1801	}
   1802}
   1803
   1804static void __remap_and_issue_shared_cell(void *context,
   1805					  struct dm_bio_prison_cell *cell)
   1806{
   1807	struct remap_info *info = context;
   1808	struct bio *bio;
   1809
   1810	while ((bio = bio_list_pop(&cell->bios))) {
   1811		if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
   1812		    bio_op(bio) == REQ_OP_DISCARD)
   1813			bio_list_add(&info->defer_bios, bio);
   1814		else {
   1815			struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   1816
   1817			h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
   1818			inc_all_io_entry(info->tc->pool, bio);
   1819			bio_list_add(&info->issue_bios, bio);
   1820		}
   1821	}
   1822}
   1823
   1824static void remap_and_issue_shared_cell(struct thin_c *tc,
   1825					struct dm_bio_prison_cell *cell,
   1826					dm_block_t block)
   1827{
   1828	struct bio *bio;
   1829	struct remap_info info;
   1830
   1831	info.tc = tc;
   1832	bio_list_init(&info.defer_bios);
   1833	bio_list_init(&info.issue_bios);
   1834
   1835	cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
   1836			   &info, cell);
   1837
   1838	while ((bio = bio_list_pop(&info.defer_bios)))
   1839		thin_defer_bio(tc, bio);
   1840
   1841	while ((bio = bio_list_pop(&info.issue_bios)))
   1842		remap_and_issue(tc, bio, block);
   1843}
   1844
   1845static void process_shared_bio(struct thin_c *tc, struct bio *bio,
   1846			       dm_block_t block,
   1847			       struct dm_thin_lookup_result *lookup_result,
   1848			       struct dm_bio_prison_cell *virt_cell)
   1849{
   1850	struct dm_bio_prison_cell *data_cell;
   1851	struct pool *pool = tc->pool;
   1852	struct dm_cell_key key;
   1853
   1854	/*
   1855	 * If cell is already occupied, then sharing is already in the process
   1856	 * of being broken so we have nothing further to do here.
   1857	 */
   1858	build_data_key(tc->td, lookup_result->block, &key);
   1859	if (bio_detain(pool, &key, bio, &data_cell)) {
   1860		cell_defer_no_holder(tc, virt_cell);
   1861		return;
   1862	}
   1863
   1864	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
   1865		break_sharing(tc, bio, block, &key, lookup_result, data_cell);
   1866		cell_defer_no_holder(tc, virt_cell);
   1867	} else {
   1868		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   1869
   1870		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
   1871		inc_all_io_entry(pool, bio);
   1872		remap_and_issue(tc, bio, lookup_result->block);
   1873
   1874		remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
   1875		remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
   1876	}
   1877}
   1878
   1879static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
   1880			    struct dm_bio_prison_cell *cell)
   1881{
   1882	int r;
   1883	dm_block_t data_block;
   1884	struct pool *pool = tc->pool;
   1885
   1886	/*
   1887	 * Remap empty bios (flushes) immediately, without provisioning.
   1888	 */
   1889	if (!bio->bi_iter.bi_size) {
   1890		inc_all_io_entry(pool, bio);
   1891		cell_defer_no_holder(tc, cell);
   1892
   1893		remap_and_issue(tc, bio, 0);
   1894		return;
   1895	}
   1896
   1897	/*
   1898	 * Fill read bios with zeroes and complete them immediately.
   1899	 */
   1900	if (bio_data_dir(bio) == READ) {
   1901		zero_fill_bio(bio);
   1902		cell_defer_no_holder(tc, cell);
   1903		bio_endio(bio);
   1904		return;
   1905	}
   1906
   1907	r = alloc_data_block(tc, &data_block);
   1908	switch (r) {
   1909	case 0:
   1910		if (tc->origin_dev)
   1911			schedule_external_copy(tc, block, data_block, cell, bio);
   1912		else
   1913			schedule_zero(tc, block, data_block, cell, bio);
   1914		break;
   1915
   1916	case -ENOSPC:
   1917		retry_bios_on_resume(pool, cell);
   1918		break;
   1919
   1920	default:
   1921		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
   1922			    __func__, r);
   1923		cell_error(pool, cell);
   1924		break;
   1925	}
   1926}
   1927
/*
 * Process the bio that holds @cell.
 *
 * The thin block addressed by the bio is looked up with
 * dm_thin_find_block().  Depending on the result the bio is remapped and
 * issued, handed to process_shared_bio(), read from the external origin
 * device, or handed to provision_block().  Any other lookup error fails
 * the bio.
 */
static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	int r;
	struct pool *pool = tc->pool;
	struct bio *bio = cell->holder;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	/* While the thin is in requeue mode the whole cell is requeued. */
	if (tc->requeue_mode) {
		cell_requeue(pool, cell);
		return;
	}

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/* A mapping exists; shared blocks get separate handling. */
		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result, cell);
		else {
			inc_all_io_entry(pool, bio);
			remap_and_issue(tc, bio, lookup_result.block);
			inc_remap_and_issue_cell(tc, cell, lookup_result.block);
		}
		break;

	case -ENODATA:
		/*
		 * Lookup reported -ENODATA.  Reads are served from the
		 * external origin when there is one; everything else goes
		 * to provision_block().
		 */
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);

			/* Read lies entirely within the origin. */
			if (bio_end_sector(bio) <= tc->origin_size)
				remap_to_origin_and_issue(tc, bio);

			/*
			 * Read starts inside the origin but runs past its end:
			 * zero the whole payload, then shrink the bio to the
			 * part inside the origin before issuing it.
			 */
			else if (bio->bi_iter.bi_sector < tc->origin_size) {
				zero_fill_bio(bio);
				bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
				remap_to_origin_and_issue(tc, bio);

			/* Read starts at or beyond the end of the origin. */
			} else {
				zero_fill_bio(bio);
				bio_endio(bio);
			}
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}
   1982
   1983static void process_bio(struct thin_c *tc, struct bio *bio)
   1984{
   1985	struct pool *pool = tc->pool;
   1986	dm_block_t block = get_bio_block(tc, bio);
   1987	struct dm_bio_prison_cell *cell;
   1988	struct dm_cell_key key;
   1989
   1990	/*
   1991	 * If cell is already occupied, then the block is already
   1992	 * being provisioned so we have nothing further to do here.
   1993	 */
   1994	build_virtual_key(tc->td, block, &key);
   1995	if (bio_detain(pool, &key, bio, &cell))
   1996		return;
   1997
   1998	process_cell(tc, cell);
   1999}
   2000
/*
 * Handle @bio without allocating or changing any mapping.
 *
 * @cell may be NULL (process_bio_read_only() passes NULL).  When it is
 * non-NULL it is released or re-issued on every path through the switch.
 */
static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
				    struct dm_bio_prison_cell *cell)
{
	int r;
	int rw = bio_data_dir(bio);
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * A non-empty write to a shared block is handed to
		 * handle_unserviceable_bio(); anything else is simply
		 * remapped to the existing data block.
		 */
		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
			handle_unserviceable_bio(tc->pool, bio);
			if (cell)
				cell_defer_no_holder(tc, cell);
		} else {
			inc_all_io_entry(tc->pool, bio);
			remap_and_issue(tc, bio, lookup_result.block);
			if (cell)
				inc_remap_and_issue_cell(tc, cell, lookup_result.block);
		}
		break;

	case -ENODATA:
		if (cell)
			cell_defer_no_holder(tc, cell);
		/* Anything but a read is handed to handle_unserviceable_bio(). */
		if (rw != READ) {
			handle_unserviceable_bio(tc->pool, bio);
			break;
		}

		/* Reads go to the external origin when one is configured ... */
		if (tc->origin_dev) {
			inc_all_io_entry(tc->pool, bio);
			remap_to_origin_and_issue(tc, bio);
			break;
		}

		/* ... and otherwise complete with a zero-filled payload. */
		zero_fill_bio(bio);
		bio_endio(bio);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		if (cell)
			cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}
   2051
/*
 * Read-only handler for a bare bio: same as __process_bio_read_only()
 * with no prison cell.  set_pool_mode() installs it as pool->process_bio
 * for the read-only, out-of-metadata-space and out-of-data-space modes.
 */
static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
	__process_bio_read_only(tc, bio, NULL);
}
   2056
   2057static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
   2058{
   2059	__process_bio_read_only(tc, cell->holder, cell);
   2060}
   2061
/*
 * Complete @bio with bio_endio() without doing any other work; @tc is
 * unused.  set_pool_mode() installs this as pool->process_discard for
 * PM_OUT_OF_METADATA_SPACE and PM_READ_ONLY.
 */
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
	bio_endio(bio);
}
   2066
/*
 * Fail @bio with bio_io_error(); @tc is unused.  set_pool_mode() installs
 * this as both pool->process_bio and pool->process_discard for PM_FAIL.
 */
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}
   2071
/*
 * Hand @cell to cell_success() on the thin's pool.  set_pool_mode()
 * installs this as pool->process_discard_cell for
 * PM_OUT_OF_METADATA_SPACE and PM_READ_ONLY.
 */
static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	cell_success(tc->pool, cell);
}
   2076
/*
 * Hand @cell to cell_error() on the thin's pool.  set_pool_mode() installs
 * this as pool->process_cell and pool->process_discard_cell for PM_FAIL.
 */
static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	cell_error(tc->pool, cell);
}
   2081
   2082/*
   2083 * FIXME: should we also commit due to size of transaction, measured in
   2084 * metadata blocks?
   2085 */
   2086static int need_commit_due_to_time(struct pool *pool)
   2087{
   2088	return !time_in_range(jiffies, pool->last_commit_jiffies,
   2089			      pool->last_commit_jiffies + COMMIT_PERIOD);
   2090}
   2091
/* Map an rb_node embedded in a struct dm_thin_endio_hook back to that hook. */
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
/* Recover the bio whose per-bio data is the given struct dm_thin_endio_hook. */
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
   2094
   2095static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
   2096{
   2097	struct rb_node **rbp, *parent;
   2098	struct dm_thin_endio_hook *pbd;
   2099	sector_t bi_sector = bio->bi_iter.bi_sector;
   2100
   2101	rbp = &tc->sort_bio_list.rb_node;
   2102	parent = NULL;
   2103	while (*rbp) {
   2104		parent = *rbp;
   2105		pbd = thin_pbd(parent);
   2106
   2107		if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
   2108			rbp = &(*rbp)->rb_left;
   2109		else
   2110			rbp = &(*rbp)->rb_right;
   2111	}
   2112
   2113	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   2114	rb_link_node(&pbd->rb_node, parent, rbp);
   2115	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
   2116}
   2117
   2118static void __extract_sorted_bios(struct thin_c *tc)
   2119{
   2120	struct rb_node *node;
   2121	struct dm_thin_endio_hook *pbd;
   2122	struct bio *bio;
   2123
   2124	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
   2125		pbd = thin_pbd(node);
   2126		bio = thin_bio(pbd);
   2127
   2128		bio_list_add(&tc->deferred_bio_list, bio);
   2129		rb_erase(&pbd->rb_node, &tc->sort_bio_list);
   2130	}
   2131
   2132	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
   2133}
   2134
   2135static void __sort_thin_deferred_bios(struct thin_c *tc)
   2136{
   2137	struct bio *bio;
   2138	struct bio_list bios;
   2139
   2140	bio_list_init(&bios);
   2141	bio_list_merge(&bios, &tc->deferred_bio_list);
   2142	bio_list_init(&tc->deferred_bio_list);
   2143
   2144	/* Sort deferred_bio_list using rb-tree */
   2145	while ((bio = bio_list_pop(&bios)))
   2146		__thin_bio_rb_add(tc, bio);
   2147
   2148	/*
   2149	 * Transfer the sorted bios in sort_bio_list back to
   2150	 * deferred_bio_list to allow lockless submission of
   2151	 * all bios.
   2152	 */
   2153	__extract_sorted_bios(tc);
   2154}
   2155
/*
 * Process the bios queued on tc->deferred_bio_list.
 *
 * The list is sorted and detached under tc->lock, then each bio is passed
 * to pool->process_discard or pool->process_bio without the lock held.
 * If ensure_next_mapping() fails, the current bio and all remaining bios
 * are put back on tc->deferred_bio_list and processing stops.
 */
static void process_thin_deferred_bios(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	struct bio *bio;
	struct bio_list bios;
	struct blk_plug plug;
	unsigned count = 0;

	/* In requeue mode every deferred bio is ended with BLK_STS_DM_REQUEUE. */
	if (tc->requeue_mode) {
		error_thin_bio_list(tc, &tc->deferred_bio_list,
				BLK_STS_DM_REQUEUE);
		return;
	}

	bio_list_init(&bios);

	spin_lock_irq(&tc->lock);

	if (bio_list_empty(&tc->deferred_bio_list)) {
		spin_unlock_irq(&tc->lock);
		return;
	}

	__sort_thin_deferred_bios(tc);

	/* Take the sorted list private so it can be walked without the lock. */
	bio_list_merge(&bios, &tc->deferred_bio_list);
	bio_list_init(&tc->deferred_bio_list);

	spin_unlock_irq(&tc->lock);

	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&bios))) {
		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irq(&tc->lock);
			bio_list_add(&tc->deferred_bio_list, bio);
			bio_list_merge(&tc->deferred_bio_list, &bios);
			spin_unlock_irq(&tc->lock);
			break;
		}

		if (bio_op(bio) == REQ_OP_DISCARD)
			pool->process_discard(tc, bio);
		else
			pool->process_bio(tc, bio);

		/* Runs for the first bio and then once every 128 bios. */
		if ((count++ & 127) == 0) {
			throttle_work_update(&pool->throttle);
			dm_pool_issue_prefetches(pool->pmd);
		}
	}
	blk_finish_plug(&plug);
}
   2213
   2214static int cmp_cells(const void *lhs, const void *rhs)
   2215{
   2216	struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
   2217	struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
   2218
   2219	BUG_ON(!lhs_cell->holder);
   2220	BUG_ON(!rhs_cell->holder);
   2221
   2222	if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
   2223		return -1;
   2224
   2225	if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
   2226		return 1;
   2227
   2228	return 0;
   2229}
   2230
   2231static unsigned sort_cells(struct pool *pool, struct list_head *cells)
   2232{
   2233	unsigned count = 0;
   2234	struct dm_bio_prison_cell *cell, *tmp;
   2235
   2236	list_for_each_entry_safe(cell, tmp, cells, user_list) {
   2237		if (count >= CELL_SORT_ARRAY_SIZE)
   2238			break;
   2239
   2240		pool->cell_sort_array[count++] = cell;
   2241		list_del(&cell->user_list);
   2242	}
   2243
   2244	sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
   2245
   2246	return count;
   2247}
   2248
/*
 * Process the cells queued on tc->deferred_cells.
 *
 * The list is detached under tc->lock and then handled in batches:
 * sort_cells() moves a batch into pool->cell_sort_array and each cell is
 * passed to pool->process_discard_cell or pool->process_cell.  If
 * ensure_next_mapping() fails, all unprocessed cells are returned to
 * tc->deferred_cells and processing stops.
 */
static void process_thin_deferred_cells(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	struct list_head cells;
	struct dm_bio_prison_cell *cell;
	unsigned i, j, count;

	INIT_LIST_HEAD(&cells);

	spin_lock_irq(&tc->lock);
	list_splice_init(&tc->deferred_cells, &cells);
	spin_unlock_irq(&tc->lock);

	if (list_empty(&cells))
		return;

	do {
		count = sort_cells(tc->pool, &cells);

		for (i = 0; i < count; i++) {
			cell = pool->cell_sort_array[i];
			BUG_ON(!cell->holder);

			/*
			 * If we've got no free new_mapping structs, and processing
			 * this bio might require one, we pause until there are some
			 * prepared mappings to process.
			 */
			if (ensure_next_mapping(pool)) {
				/*
				 * Put the rest of this batch back on the local
				 * list, then return the whole list to the thin.
				 */
				for (j = i; j < count; j++)
					list_add(&pool->cell_sort_array[j]->user_list, &cells);

				spin_lock_irq(&tc->lock);
				list_splice(&cells, &tc->deferred_cells);
				spin_unlock_irq(&tc->lock);
				return;
			}

			if (bio_op(cell->holder) == REQ_OP_DISCARD)
				pool->process_discard_cell(tc, cell);
			else
				pool->process_cell(tc, cell);
		}
	} while (!list_empty(&cells));
}
   2294
/*
 * Forward declarations: get_first_thin() and get_next_thin() below use
 * these before their definitions appear.
 */
static void thin_get(struct thin_c *tc);
static void thin_put(struct thin_c *tc);
   2297
   2298/*
   2299 * We can't hold rcu_read_lock() around code that can block.  So we
   2300 * find a thin with the rcu lock held; bump a refcount; then drop
   2301 * the lock.
   2302 */
   2303static struct thin_c *get_first_thin(struct pool *pool)
   2304{
   2305	struct thin_c *tc = NULL;
   2306
   2307	rcu_read_lock();
   2308	if (!list_empty(&pool->active_thins)) {
   2309		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
   2310		thin_get(tc);
   2311	}
   2312	rcu_read_unlock();
   2313
   2314	return tc;
   2315}
   2316
   2317static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
   2318{
   2319	struct thin_c *old_tc = tc;
   2320
   2321	rcu_read_lock();
   2322	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
   2323		thin_get(tc);
   2324		thin_put(old_tc);
   2325		rcu_read_unlock();
   2326		return tc;
   2327	}
   2328	thin_put(old_tc);
   2329	rcu_read_unlock();
   2330
   2331	return NULL;
   2332}
   2333
/*
 * Process the deferred cells and bios of every active thin in @pool, then
 * deal with the pool's deferred flush bios and flush completions, which
 * requires a metadata commit first.
 */
static void process_deferred_bios(struct pool *pool)
{
	struct bio *bio;
	struct bio_list bios, bio_completions;
	struct thin_c *tc;

	/* get_next_thin() drops the reference on the thin just processed. */
	tc = get_first_thin(pool);
	while (tc) {
		process_thin_deferred_cells(tc);
		process_thin_deferred_bios(tc);
		tc = get_next_thin(pool, tc);
	}

	/*
	 * If there are any deferred flush bios, we must commit the metadata
	 * before issuing them or signaling their completion.
	 */
	bio_list_init(&bios);
	bio_list_init(&bio_completions);

	/* Detach both lists under pool->lock. */
	spin_lock_irq(&pool->lock);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);

	bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
	bio_list_init(&pool->deferred_flush_completions);
	spin_unlock_irq(&pool->lock);

	/*
	 * Nothing to flush and no time-based commit due (metadata changed in
	 * this transaction and the commit period elapsed): skip the commit.
	 */
	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
		return;

	/* Commit failed: error every bio from both lists. */
	if (commit(pool)) {
		bio_list_merge(&bios, &bio_completions);

		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}
	pool->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bio_completions)))
		bio_endio(bio);

	while ((bio = bio_list_pop(&bios))) {
		/*
		 * The data device was flushed as part of metadata commit,
		 * so complete redundant flushes immediately.
		 */
		if (bio->bi_opf & REQ_PREFLUSH)
			bio_endio(bio);
		else
			dm_submit_bio_remap(bio, NULL);
	}
}
   2389
/*
 * Work function for pool->worker (set up with INIT_WORK() in pool_create()).
 *
 * Issues metadata prefetches, processes the prepared mappings, the prepared
 * discards and the second-stage prepared discards, and finally the deferred
 * bios.  throttle_work_update() is called between the stages, bracketed by
 * throttle_work_start() and throttle_work_complete().
 */
static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	throttle_work_start(&pool->throttle);
	dm_pool_issue_prefetches(pool->pmd);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
	throttle_work_update(&pool->throttle);
	process_deferred_bios(pool);
	throttle_work_complete(&pool->throttle);
}
   2406
   2407/*
   2408 * We want to commit periodically so that not too much
   2409 * unwritten data builds up.
   2410 */
   2411static void do_waker(struct work_struct *ws)
   2412{
   2413	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
   2414	wake_worker(pool);
   2415	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
   2416}
   2417
   2418/*
   2419 * We're holding onto IO to allow userland time to react.  After the
   2420 * timeout either the pool will have been resized (and thus back in
   2421 * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
   2422 */
   2423static void do_no_space_timeout(struct work_struct *ws)
   2424{
   2425	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
   2426					 no_space_timeout);
   2427
   2428	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
   2429		pool->pf.error_if_no_space = true;
   2430		notify_of_pool_mode_change(pool);
   2431		error_retry_list_with_code(pool, BLK_STS_NOSPC);
   2432	}
   2433}
   2434
   2435/*----------------------------------------------------------------*/
   2436
/*
 * A work item run on the pool's workqueue that the submitter waits for;
 * see pool_work_wait() and pool_work_complete().
 */
struct pool_work {
	struct work_struct worker;	/* queued on pool->wq by pool_work_wait() */
	struct completion complete;	/* signalled by pool_work_complete() */
};
   2441
/* Map the work_struct embedded in a struct pool_work back to its container. */
static struct pool_work *to_pool_work(struct work_struct *ws)
{
	return container_of(ws, struct pool_work, worker);
}
   2446
/* Signal the completion that pool_work_wait() is waiting on for @pw. */
static void pool_work_complete(struct pool_work *pw)
{
	complete(&pw->complete);
}
   2451
   2452static void pool_work_wait(struct pool_work *pw, struct pool *pool,
   2453			   void (*fn)(struct work_struct *))
   2454{
   2455	INIT_WORK_ONSTACK(&pw->worker, fn);
   2456	init_completion(&pw->complete);
   2457	queue_work(pool->wq, &pw->worker);
   2458	wait_for_completion(&pw->complete);
   2459}
   2460
   2461/*----------------------------------------------------------------*/
   2462
/* A pool_work that carries the thin device its work function operates on. */
struct noflush_work {
	struct pool_work pw;
	struct thin_c *tc;	/* thin whose requeue_mode is switched */
};
   2467
   2468static struct noflush_work *to_noflush(struct work_struct *ws)
   2469{
   2470	return container_of(to_pool_work(ws), struct noflush_work, pw);
   2471}
   2472
   2473static void do_noflush_start(struct work_struct *ws)
   2474{
   2475	struct noflush_work *w = to_noflush(ws);
   2476	w->tc->requeue_mode = true;
   2477	requeue_io(w->tc);
   2478	pool_work_complete(&w->pw);
   2479}
   2480
   2481static void do_noflush_stop(struct work_struct *ws)
   2482{
   2483	struct noflush_work *w = to_noflush(ws);
   2484	w->tc->requeue_mode = false;
   2485	pool_work_complete(&w->pw);
   2486}
   2487
   2488static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
   2489{
   2490	struct noflush_work w;
   2491
   2492	w.tc = tc;
   2493	pool_work_wait(&w.pw, tc->pool, fn);
   2494}
   2495
   2496/*----------------------------------------------------------------*/
   2497
/* Report whether discard passdown is set in the pool target's adjusted features. */
static bool passdown_enabled(struct pool_c *pt)
{
	return pt->adjusted_pf.discard_passdown;
}
   2502
   2503static void set_discard_callbacks(struct pool *pool)
   2504{
   2505	struct pool_c *pt = pool->ti->private;
   2506
   2507	if (passdown_enabled(pt)) {
   2508		pool->process_discard_cell = process_discard_cell_passdown;
   2509		pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
   2510		pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
   2511	} else {
   2512		pool->process_discard_cell = process_discard_cell_no_passdown;
   2513		pool->process_prepared_discard = process_prepared_discard_no_passdown;
   2514	}
   2515}
   2516
/*
 * Switch @pool to @new_mode and install the process_* callbacks that go
 * with it.
 *
 * The requested mode may be overridden: PM_WRITE is refused while the
 * metadata needs checking, and a pool already in PM_FAIL stays there.
 * The resulting mode is stored in both pool->pf.mode and the pool
 * target's adjusted_pf.mode, and notify_of_pool_mode_change() is called
 * if the mode actually changed.
 */
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
	struct pool_c *pt = pool->ti->private;
	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
	enum pool_mode old_mode = get_pool_mode(pool);
	/* Module parameter in seconds, converted to jiffies. */
	unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;

	/*
	 * Never allow the pool to transition to PM_WRITE mode if user
	 * intervention is required to verify metadata and data consistency.
	 */
	if (new_mode == PM_WRITE && needs_check) {
		DMERR("%s: unable to switch pool to write mode until repaired.",
		      dm_device_name(pool->pool_md));
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = PM_READ_ONLY;
	}
	/*
	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
	 * not going to recover without a thin_repair.	So we never let the
	 * pool move out of the old mode.
	 */
	if (old_mode == PM_FAIL)
		new_mode = old_mode;

	switch (new_mode) {
	case PM_FAIL:
		/* Every bio, cell and prepared mapping/discard is failed. */
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_fail;
		pool->process_discard = process_bio_fail;
		pool->process_cell = process_cell_fail;
		pool->process_discard_cell = process_cell_fail;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_fail;

		error_retry_list(pool);
		break;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
		/*
		 * Metadata goes read-only; bios use the read-only handlers
		 * and discards are completed through the *_success handlers.
		 */
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_read_only;
		pool->process_discard = process_bio_success;
		pool->process_cell = process_cell_read_only;
		pool->process_discard_cell = process_cell_success;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_success;

		error_retry_list(pool);
		break;

	case PM_OUT_OF_DATA_SPACE:
		/*
		 * Ideally we'd never hit this state; the low water mark
		 * would trigger userland to extend the pool before we
		 * completely run out of data space.  However, many small
		 * IOs to unprovisioned space can consume data space at an
		 * alarming rate.  Adjust your low water mark if you're
		 * frequently seeing this mode.
		 */
		pool->out_of_data_space = true;
		pool->process_bio = process_bio_read_only;
		pool->process_discard = process_discard_bio;
		pool->process_cell = process_cell_read_only;
		pool->process_prepared_mapping = process_prepared_mapping;
		set_discard_callbacks(pool);

		/*
		 * Arm do_no_space_timeout() unless IO is already being errored
		 * or the timeout is configured as zero.
		 */
		if (!pool->pf.error_if_no_space && no_space_timeout)
			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
		break;

	case PM_WRITE:
		/* Leaving out-of-data-space: stop the pending timeout work. */
		if (old_mode == PM_OUT_OF_DATA_SPACE)
			cancel_delayed_work_sync(&pool->no_space_timeout);
		pool->out_of_data_space = false;
		/* Restore the originally requested error_if_no_space setting. */
		pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
		dm_pool_metadata_read_write(pool->pmd);
		pool->process_bio = process_bio;
		pool->process_discard = process_discard_bio;
		pool->process_cell = process_cell;
		pool->process_prepared_mapping = process_prepared_mapping;
		set_discard_callbacks(pool);
		break;
	}

	pool->pf.mode = new_mode;
	/*
	 * The pool mode may have changed, sync it so bind_control_target()
	 * doesn't cause an unexpected mode transition on resume.
	 */
	pt->adjusted_pf.mode = new_mode;

	if (old_mode != new_mode)
		notify_of_pool_mode_change(pool);
}
   2614
   2615static void abort_transaction(struct pool *pool)
   2616{
   2617	const char *dev_name = dm_device_name(pool->pool_md);
   2618
   2619	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
   2620	if (dm_pool_abort_metadata(pool->pmd)) {
   2621		DMERR("%s: failed to abort metadata transaction", dev_name);
   2622		set_pool_mode(pool, PM_FAIL);
   2623	}
   2624
   2625	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
   2626		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
   2627		set_pool_mode(pool, PM_FAIL);
   2628	}
   2629}
   2630
   2631static void metadata_operation_failed(struct pool *pool, const char *op, int r)
   2632{
   2633	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
   2634		    dm_device_name(pool->pool_md), op, r);
   2635
   2636	abort_transaction(pool);
   2637	set_pool_mode(pool, PM_READ_ONLY);
   2638}
   2639
   2640/*----------------------------------------------------------------*/
   2641
   2642/*
   2643 * Mapping functions.
   2644 */
   2645
   2646/*
   2647 * Called only while mapping a thin bio to hand it over to the workqueue.
   2648 */
   2649static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
   2650{
   2651	struct pool *pool = tc->pool;
   2652
   2653	spin_lock_irq(&tc->lock);
   2654	bio_list_add(&tc->deferred_bio_list, bio);
   2655	spin_unlock_irq(&tc->lock);
   2656
   2657	wake_worker(pool);
   2658}
   2659
   2660static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
   2661{
   2662	struct pool *pool = tc->pool;
   2663
   2664	throttle_lock(&pool->throttle);
   2665	thin_defer_bio(tc, bio);
   2666	throttle_unlock(&pool->throttle);
   2667}
   2668
   2669static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
   2670{
   2671	struct pool *pool = tc->pool;
   2672
   2673	throttle_lock(&pool->throttle);
   2674	spin_lock_irq(&tc->lock);
   2675	list_add_tail(&cell->user_list, &tc->deferred_cells);
   2676	spin_unlock_irq(&tc->lock);
   2677	throttle_unlock(&pool->throttle);
   2678
   2679	wake_worker(pool);
   2680}
   2681
   2682static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
   2683{
   2684	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   2685
   2686	h->tc = tc;
   2687	h->shared_read_entry = NULL;
   2688	h->all_io_entry = NULL;
   2689	h->overwrite_mapping = NULL;
   2690	h->cell = NULL;
   2691}
   2692
/*
 * Non-blocking function called from the thin target's map function.
 *
 * Returns DM_MAPIO_REMAPPED when the bio was remapped here to its data
 * block, and DM_MAPIO_SUBMITTED on every other path (the bio was
 * completed, failed, deferred to the worker, or bio_detain() returned
 * non-zero for it).
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;
	struct dm_bio_prison_cell *virt_cell, *data_cell;
	struct dm_cell_key key;

	thin_hook_bio(tc, bio);

	/* In requeue mode the bio is ended straight away with a requeue status. */
	if (tc->requeue_mode) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flushes and discards are always handed to the worker. */
	if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
		thin_defer_bio_with_throttle(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * We must hold the virtual cell before doing the lookup, otherwise
	 * there's a race with discard.
	 */
	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool, &key, bio, &virt_cell))
		return DM_MAPIO_SUBMITTED;

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap.  You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant. The
			 * shared flag will be set in their case.
			 */
			thin_defer_cell(tc, virt_cell);
			return DM_MAPIO_SUBMITTED;
		}

		/* Also detain on the data block; release the virtual cell if that fails. */
		build_data_key(tc->td, result.block, &key);
		if (bio_detain(tc->pool, &key, bio, &data_cell)) {
			cell_defer_no_holder(tc, virt_cell);
			return DM_MAPIO_SUBMITTED;
		}

		/* Both cells are released before the bio is remapped. */
		inc_all_io_entry(tc->pool, bio);
		cell_defer_no_holder(tc, data_cell);
		cell_defer_no_holder(tc, virt_cell);

		remap(tc, bio, result.block);
		return DM_MAPIO_REMAPPED;

	case -ENODATA:
	case -EWOULDBLOCK:
		/* Let the worker deal with the cell. */
		thin_defer_cell(tc, virt_cell);
		return DM_MAPIO_SUBMITTED;

	default:
		/*
		 * Must always call bio_io_error on failure.
		 * dm_thin_find_block can fail with -EINVAL if the
		 * pool is switched to fail-io mode.
		 */
		bio_io_error(bio);
		cell_defer_no_holder(tc, virt_cell);
		return DM_MAPIO_SUBMITTED;
	}
}
   2787
   2788static void requeue_bios(struct pool *pool)
   2789{
   2790	struct thin_c *tc;
   2791
   2792	rcu_read_lock();
   2793	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
   2794		spin_lock_irq(&tc->lock);
   2795		bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
   2796		bio_list_init(&tc->retry_on_resume_list);
   2797		spin_unlock_irq(&tc->lock);
   2798	}
   2799	rcu_read_unlock();
   2800}
   2801
   2802/*----------------------------------------------------------------
   2803 * Binding of control targets to a pool object
   2804 *--------------------------------------------------------------*/
   2805static bool is_factor(sector_t block_size, uint32_t n)
   2806{
   2807	return !sector_div(block_size, n);
   2808}
   2809
   2810/*
   2811 * If discard_passdown was enabled verify that the data device
   2812 * supports discards.  Disable discard_passdown if not.
   2813 */
   2814static void disable_passdown_if_not_supported(struct pool_c *pt)
   2815{
   2816	struct pool *pool = pt->pool;
   2817	struct block_device *data_bdev = pt->data_dev->bdev;
   2818	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
   2819	const char *reason = NULL;
   2820
   2821	if (!pt->adjusted_pf.discard_passdown)
   2822		return;
   2823
   2824	if (!bdev_max_discard_sectors(pt->data_dev->bdev))
   2825		reason = "discard unsupported";
   2826
   2827	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
   2828		reason = "max discard sectors smaller than a block";
   2829
   2830	if (reason) {
   2831		DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
   2832		pt->adjusted_pf.discard_passdown = false;
   2833	}
   2834}
   2835
   2836static int bind_control_target(struct pool *pool, struct dm_target *ti)
   2837{
   2838	struct pool_c *pt = ti->private;
   2839
   2840	/*
   2841	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
   2842	 */
   2843	enum pool_mode old_mode = get_pool_mode(pool);
   2844	enum pool_mode new_mode = pt->adjusted_pf.mode;
   2845
   2846	/*
   2847	 * Don't change the pool's mode until set_pool_mode() below.
   2848	 * Otherwise the pool's process_* function pointers may
   2849	 * not match the desired pool mode.
   2850	 */
   2851	pt->adjusted_pf.mode = old_mode;
   2852
   2853	pool->ti = ti;
   2854	pool->pf = pt->adjusted_pf;
   2855	pool->low_water_blocks = pt->low_water_blocks;
   2856
   2857	set_pool_mode(pool, new_mode);
   2858
   2859	return 0;
   2860}
   2861
   2862static void unbind_control_target(struct pool *pool, struct dm_target *ti)
   2863{
   2864	if (pool->ti == ti)
   2865		pool->ti = NULL;
   2866}
   2867
   2868/*----------------------------------------------------------------
   2869 * Pool creation
   2870 *--------------------------------------------------------------*/
   2871/* Initialize pool features. */
   2872static void pool_features_init(struct pool_features *pf)
   2873{
   2874	pf->mode = PM_WRITE;
   2875	pf->zero_new_blocks = true;
   2876	pf->discard_enabled = true;
   2877	pf->discard_passdown = true;
   2878	pf->error_if_no_space = false;
   2879}
   2880
/*
 * Tear down @pool: remove it from the pool table, release everything it
 * owns and free the pool itself.
 */
static void __pool_destroy(struct pool *pool)
{
	__pool_table_remove(pool);

	vfree(pool->cell_sort_array);
	/* A failure to close the metadata is only logged; teardown continues. */
	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	dm_bio_prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->wq)
		destroy_workqueue(pool->wq);

	/* Return the preallocated mapping, if any, before the mempool goes away. */
	if (pool->next_mapping)
		mempool_free(pool->next_mapping, &pool->mapping_pool);
	mempool_exit(&pool->mapping_pool);
	dm_deferred_set_destroy(pool->shared_read_ds);
	dm_deferred_set_destroy(pool->all_io_ds);
	kfree(pool);
}
   2902
/*
 * NOTE(review): presumably the slab cache backing pool->mapping_pool; its
 * users are outside this part of the file - confirm.
 */
static struct kmem_cache *_new_mapping_cache;
   2904
   2905static struct pool *pool_create(struct mapped_device *pool_md,
   2906				struct block_device *metadata_dev,
   2907				struct block_device *data_dev,
   2908				unsigned long block_size,
   2909				int read_only, char **error)
   2910{
   2911	int r;
   2912	void *err_p;
   2913	struct pool *pool;
   2914	struct dm_pool_metadata *pmd;
   2915	bool format_device = read_only ? false : true;
   2916
   2917	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
   2918	if (IS_ERR(pmd)) {
   2919		*error = "Error creating metadata object";
   2920		return (struct pool *)pmd;
   2921	}
   2922
   2923	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
   2924	if (!pool) {
   2925		*error = "Error allocating memory for pool";
   2926		err_p = ERR_PTR(-ENOMEM);
   2927		goto bad_pool;
   2928	}
   2929
   2930	pool->pmd = pmd;
   2931	pool->sectors_per_block = block_size;
   2932	if (block_size & (block_size - 1))
   2933		pool->sectors_per_block_shift = -1;
   2934	else
   2935		pool->sectors_per_block_shift = __ffs(block_size);
   2936	pool->low_water_blocks = 0;
   2937	pool_features_init(&pool->pf);
   2938	pool->prison = dm_bio_prison_create();
   2939	if (!pool->prison) {
   2940		*error = "Error creating pool's bio prison";
   2941		err_p = ERR_PTR(-ENOMEM);
   2942		goto bad_prison;
   2943	}
   2944
   2945	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
   2946	if (IS_ERR(pool->copier)) {
   2947		r = PTR_ERR(pool->copier);
   2948		*error = "Error creating pool's kcopyd client";
   2949		err_p = ERR_PTR(r);
   2950		goto bad_kcopyd_client;
   2951	}
   2952
   2953	/*
   2954	 * Create singlethreaded workqueue that will service all devices
   2955	 * that use this metadata.
   2956	 */
   2957	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
   2958	if (!pool->wq) {
   2959		*error = "Error creating pool's workqueue";
   2960		err_p = ERR_PTR(-ENOMEM);
   2961		goto bad_wq;
   2962	}
   2963
   2964	throttle_init(&pool->throttle);
   2965	INIT_WORK(&pool->worker, do_worker);
   2966	INIT_DELAYED_WORK(&pool->waker, do_waker);
   2967	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
   2968	spin_lock_init(&pool->lock);
   2969	bio_list_init(&pool->deferred_flush_bios);
   2970	bio_list_init(&pool->deferred_flush_completions);
   2971	INIT_LIST_HEAD(&pool->prepared_mappings);
   2972	INIT_LIST_HEAD(&pool->prepared_discards);
   2973	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
   2974	INIT_LIST_HEAD(&pool->active_thins);
   2975	pool->low_water_triggered = false;
   2976	pool->suspended = true;
   2977	pool->out_of_data_space = false;
   2978
   2979	pool->shared_read_ds = dm_deferred_set_create();
   2980	if (!pool->shared_read_ds) {
   2981		*error = "Error creating pool's shared read deferred set";
   2982		err_p = ERR_PTR(-ENOMEM);
   2983		goto bad_shared_read_ds;
   2984	}
   2985
   2986	pool->all_io_ds = dm_deferred_set_create();
   2987	if (!pool->all_io_ds) {
   2988		*error = "Error creating pool's all io deferred set";
   2989		err_p = ERR_PTR(-ENOMEM);
   2990		goto bad_all_io_ds;
   2991	}
   2992
   2993	pool->next_mapping = NULL;
   2994	r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
   2995				   _new_mapping_cache);
   2996	if (r) {
   2997		*error = "Error creating pool's mapping mempool";
   2998		err_p = ERR_PTR(r);
   2999		goto bad_mapping_pool;
   3000	}
   3001
   3002	pool->cell_sort_array =
   3003		vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
   3004				   sizeof(*pool->cell_sort_array)));
   3005	if (!pool->cell_sort_array) {
   3006		*error = "Error allocating cell sort array";
   3007		err_p = ERR_PTR(-ENOMEM);
   3008		goto bad_sort_array;
   3009	}
   3010
   3011	pool->ref_count = 1;
   3012	pool->last_commit_jiffies = jiffies;
   3013	pool->pool_md = pool_md;
   3014	pool->md_dev = metadata_dev;
   3015	pool->data_dev = data_dev;
   3016	__pool_table_insert(pool);
   3017
   3018	return pool;
   3019
   3020bad_sort_array:
   3021	mempool_exit(&pool->mapping_pool);
   3022bad_mapping_pool:
   3023	dm_deferred_set_destroy(pool->all_io_ds);
   3024bad_all_io_ds:
   3025	dm_deferred_set_destroy(pool->shared_read_ds);
   3026bad_shared_read_ds:
   3027	destroy_workqueue(pool->wq);
   3028bad_wq:
   3029	dm_kcopyd_client_destroy(pool->copier);
   3030bad_kcopyd_client:
   3031	dm_bio_prison_destroy(pool->prison);
   3032bad_prison:
   3033	kfree(pool);
   3034bad_pool:
   3035	if (dm_pool_metadata_close(pmd))
   3036		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
   3037
   3038	return err_p;
   3039}
   3040
   3041static void __pool_inc(struct pool *pool)
   3042{
   3043	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
   3044	pool->ref_count++;
   3045}
   3046
   3047static void __pool_dec(struct pool *pool)
   3048{
   3049	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
   3050	BUG_ON(!pool->ref_count);
   3051	if (!--pool->ref_count)
   3052		__pool_destroy(pool);
   3053}
   3054
   3055static struct pool *__pool_find(struct mapped_device *pool_md,
   3056				struct block_device *metadata_dev,
   3057				struct block_device *data_dev,
   3058				unsigned long block_size, int read_only,
   3059				char **error, int *created)
   3060{
   3061	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
   3062
   3063	if (pool) {
   3064		if (pool->pool_md != pool_md) {
   3065			*error = "metadata device already in use by a pool";
   3066			return ERR_PTR(-EBUSY);
   3067		}
   3068		if (pool->data_dev != data_dev) {
   3069			*error = "data device already in use by a pool";
   3070			return ERR_PTR(-EBUSY);
   3071		}
   3072		__pool_inc(pool);
   3073
   3074	} else {
   3075		pool = __pool_table_lookup(pool_md);
   3076		if (pool) {
   3077			if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
   3078				*error = "different pool cannot replace a pool";
   3079				return ERR_PTR(-EINVAL);
   3080			}
   3081			__pool_inc(pool);
   3082
   3083		} else {
   3084			pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
   3085			*created = 1;
   3086		}
   3087	}
   3088
   3089	return pool;
   3090}
   3091
   3092/*----------------------------------------------------------------
   3093 * Pool target methods
   3094 *--------------------------------------------------------------*/
   3095static void pool_dtr(struct dm_target *ti)
   3096{
   3097	struct pool_c *pt = ti->private;
   3098
   3099	mutex_lock(&dm_thin_pool_table.mutex);
   3100
   3101	unbind_control_target(pt->pool, ti);
   3102	__pool_dec(pt->pool);
   3103	dm_put_device(ti, pt->metadata_dev);
   3104	dm_put_device(ti, pt->data_dev);
   3105	kfree(pt);
   3106
   3107	mutex_unlock(&dm_thin_pool_table.mutex);
   3108}
   3109
   3110static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
   3111			       struct dm_target *ti)
   3112{
   3113	int r;
   3114	unsigned argc;
   3115	const char *arg_name;
   3116
   3117	static const struct dm_arg _args[] = {
   3118		{0, 4, "Invalid number of pool feature arguments"},
   3119	};
   3120
   3121	/*
   3122	 * No feature arguments supplied.
   3123	 */
   3124	if (!as->argc)
   3125		return 0;
   3126
   3127	r = dm_read_arg_group(_args, as, &argc, &ti->error);
   3128	if (r)
   3129		return -EINVAL;
   3130
   3131	while (argc && !r) {
   3132		arg_name = dm_shift_arg(as);
   3133		argc--;
   3134
   3135		if (!strcasecmp(arg_name, "skip_block_zeroing"))
   3136			pf->zero_new_blocks = false;
   3137
   3138		else if (!strcasecmp(arg_name, "ignore_discard"))
   3139			pf->discard_enabled = false;
   3140
   3141		else if (!strcasecmp(arg_name, "no_discard_passdown"))
   3142			pf->discard_passdown = false;
   3143
   3144		else if (!strcasecmp(arg_name, "read_only"))
   3145			pf->mode = PM_READ_ONLY;
   3146
   3147		else if (!strcasecmp(arg_name, "error_if_no_space"))
   3148			pf->error_if_no_space = true;
   3149
   3150		else {
   3151			ti->error = "Unrecognised pool feature requested";
   3152			r = -EINVAL;
   3153			break;
   3154		}
   3155	}
   3156
   3157	return r;
   3158}
   3159
   3160static void metadata_low_callback(void *context)
   3161{
   3162	struct pool *pool = context;
   3163
   3164	DMWARN("%s: reached low water mark for metadata device: sending event.",
   3165	       dm_device_name(pool->pool_md));
   3166
   3167	dm_table_event(pool->ti->table);
   3168}
   3169
   3170/*
   3171 * We need to flush the data device **before** committing the metadata.
   3172 *
   3173 * This ensures that the data blocks of any newly inserted mappings are
   3174 * properly written to non-volatile storage and won't be lost in case of a
   3175 * crash.
   3176 *
   3177 * Failure to do so can result in data corruption in the case of internal or
   3178 * external snapshots and in the case of newly provisioned blocks, when block
   3179 * zeroing is enabled.
   3180 */
   3181static int metadata_pre_commit_callback(void *context)
   3182{
   3183	struct pool *pool = context;
   3184
   3185	return blkdev_issue_flush(pool->data_dev);
   3186}
   3187
   3188static sector_t get_dev_size(struct block_device *bdev)
   3189{
   3190	return bdev_nr_sectors(bdev);
   3191}
   3192
   3193static void warn_if_metadata_device_too_big(struct block_device *bdev)
   3194{
   3195	sector_t metadata_dev_size = get_dev_size(bdev);
   3196
   3197	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
   3198		DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
   3199		       bdev, THIN_METADATA_MAX_SECTORS);
   3200}
   3201
   3202static sector_t get_metadata_dev_size(struct block_device *bdev)
   3203{
   3204	sector_t metadata_dev_size = get_dev_size(bdev);
   3205
   3206	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
   3207		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
   3208
   3209	return metadata_dev_size;
   3210}
   3211
   3212static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
   3213{
   3214	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
   3215
   3216	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
   3217
   3218	return metadata_dev_size;
   3219}
   3220
   3221/*
   3222 * When a metadata threshold is crossed a dm event is triggered, and
   3223 * userland should respond by growing the metadata device.  We could let
   3224 * userland set the threshold, like we do with the data threshold, but I'm
   3225 * not sure they know enough to do this well.
   3226 */
   3227static dm_block_t calc_metadata_threshold(struct pool_c *pt)
   3228{
   3229	/*
   3230	 * 4M is ample for all ops with the possible exception of thin
   3231	 * device deletion which is harmless if it fails (just retry the
   3232	 * delete after you've grown the device).
   3233	 */
   3234	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
   3235	return min((dm_block_t)1024ULL /* 4M */, quarter);
   3236}
   3237
   3238/*
   3239 * thin-pool <metadata dev> <data dev>
   3240 *	     <data block size (sectors)>
   3241 *	     <low water mark (blocks)>
   3242 *	     [<#feature args> [<arg>]*]
   3243 *
   3244 * Optional feature arguments are:
   3245 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
   3246 *	     ignore_discard: disable discard
   3247 *	     no_discard_passdown: don't pass discards down to the data device
   3248 *	     read_only: Don't allow any changes to be made to the pool metadata.
   3249 *	     error_if_no_space: error IOs, instead of queueing, if no space.
   3250 */
   3251static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
   3252{
   3253	int r, pool_created = 0;
   3254	struct pool_c *pt;
   3255	struct pool *pool;
   3256	struct pool_features pf;
   3257	struct dm_arg_set as;
   3258	struct dm_dev *data_dev;
   3259	unsigned long block_size;
   3260	dm_block_t low_water_blocks;
   3261	struct dm_dev *metadata_dev;
   3262	fmode_t metadata_mode;
   3263
   3264	/*
   3265	 * FIXME Remove validation from scope of lock.
   3266	 */
   3267	mutex_lock(&dm_thin_pool_table.mutex);
   3268
   3269	if (argc < 4) {
   3270		ti->error = "Invalid argument count";
   3271		r = -EINVAL;
   3272		goto out_unlock;
   3273	}
   3274
   3275	as.argc = argc;
   3276	as.argv = argv;
   3277
   3278	/* make sure metadata and data are different devices */
   3279	if (!strcmp(argv[0], argv[1])) {
   3280		ti->error = "Error setting metadata or data device";
   3281		r = -EINVAL;
   3282		goto out_unlock;
   3283	}
   3284
   3285	/*
   3286	 * Set default pool features.
   3287	 */
   3288	pool_features_init(&pf);
   3289
   3290	dm_consume_args(&as, 4);
   3291	r = parse_pool_features(&as, &pf, ti);
   3292	if (r)
   3293		goto out_unlock;
   3294
   3295	metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
   3296	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
   3297	if (r) {
   3298		ti->error = "Error opening metadata block device";
   3299		goto out_unlock;
   3300	}
   3301	warn_if_metadata_device_too_big(metadata_dev->bdev);
   3302
   3303	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
   3304	if (r) {
   3305		ti->error = "Error getting data device";
   3306		goto out_metadata;
   3307	}
   3308
   3309	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
   3310	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
   3311	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
   3312	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
   3313		ti->error = "Invalid block size";
   3314		r = -EINVAL;
   3315		goto out;
   3316	}
   3317
   3318	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
   3319		ti->error = "Invalid low water mark";
   3320		r = -EINVAL;
   3321		goto out;
   3322	}
   3323
   3324	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
   3325	if (!pt) {
   3326		r = -ENOMEM;
   3327		goto out;
   3328	}
   3329
   3330	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
   3331			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
   3332	if (IS_ERR(pool)) {
   3333		r = PTR_ERR(pool);
   3334		goto out_free_pt;
   3335	}
   3336
   3337	/*
   3338	 * 'pool_created' reflects whether this is the first table load.
   3339	 * Top level discard support is not allowed to be changed after
   3340	 * initial load.  This would require a pool reload to trigger thin
   3341	 * device changes.
   3342	 */
   3343	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
   3344		ti->error = "Discard support cannot be disabled once enabled";
   3345		r = -EINVAL;
   3346		goto out_flags_changed;
   3347	}
   3348
   3349	pt->pool = pool;
   3350	pt->ti = ti;
   3351	pt->metadata_dev = metadata_dev;
   3352	pt->data_dev = data_dev;
   3353	pt->low_water_blocks = low_water_blocks;
   3354	pt->adjusted_pf = pt->requested_pf = pf;
   3355	ti->num_flush_bios = 1;
   3356
   3357	/*
   3358	 * Only need to enable discards if the pool should pass
   3359	 * them down to the data device.  The thin device's discard
   3360	 * processing will cause mappings to be removed from the btree.
   3361	 */
   3362	if (pf.discard_enabled && pf.discard_passdown) {
   3363		ti->num_discard_bios = 1;
   3364
   3365		/*
   3366		 * Setting 'discards_supported' circumvents the normal
   3367		 * stacking of discard limits (this keeps the pool and
   3368		 * thin devices' discard limits consistent).
   3369		 */
   3370		ti->discards_supported = true;
   3371	}
   3372	ti->private = pt;
   3373
   3374	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
   3375						calc_metadata_threshold(pt),
   3376						metadata_low_callback,
   3377						pool);
   3378	if (r)
   3379		goto out_flags_changed;
   3380
   3381	dm_pool_register_pre_commit_callback(pool->pmd,
   3382					     metadata_pre_commit_callback, pool);
   3383
   3384	mutex_unlock(&dm_thin_pool_table.mutex);
   3385
   3386	return 0;
   3387
   3388out_flags_changed:
   3389	__pool_dec(pool);
   3390out_free_pt:
   3391	kfree(pt);
   3392out:
   3393	dm_put_device(ti, data_dev);
   3394out_metadata:
   3395	dm_put_device(ti, metadata_dev);
   3396out_unlock:
   3397	mutex_unlock(&dm_thin_pool_table.mutex);
   3398
   3399	return r;
   3400}
   3401
   3402static int pool_map(struct dm_target *ti, struct bio *bio)
   3403{
   3404	int r;
   3405	struct pool_c *pt = ti->private;
   3406	struct pool *pool = pt->pool;
   3407
   3408	/*
   3409	 * As this is a singleton target, ti->begin is always zero.
   3410	 */
   3411	spin_lock_irq(&pool->lock);
   3412	bio_set_dev(bio, pt->data_dev->bdev);
   3413	r = DM_MAPIO_REMAPPED;
   3414	spin_unlock_irq(&pool->lock);
   3415
   3416	return r;
   3417}
   3418
   3419static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
   3420{
   3421	int r;
   3422	struct pool_c *pt = ti->private;
   3423	struct pool *pool = pt->pool;
   3424	sector_t data_size = ti->len;
   3425	dm_block_t sb_data_size;
   3426
   3427	*need_commit = false;
   3428
   3429	(void) sector_div(data_size, pool->sectors_per_block);
   3430
   3431	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
   3432	if (r) {
   3433		DMERR("%s: failed to retrieve data device size",
   3434		      dm_device_name(pool->pool_md));
   3435		return r;
   3436	}
   3437
   3438	if (data_size < sb_data_size) {
   3439		DMERR("%s: pool target (%llu blocks) too small: expected %llu",
   3440		      dm_device_name(pool->pool_md),
   3441		      (unsigned long long)data_size, sb_data_size);
   3442		return -EINVAL;
   3443
   3444	} else if (data_size > sb_data_size) {
   3445		if (dm_pool_metadata_needs_check(pool->pmd)) {
   3446			DMERR("%s: unable to grow the data device until repaired.",
   3447			      dm_device_name(pool->pool_md));
   3448			return 0;
   3449		}
   3450
   3451		if (sb_data_size)
   3452			DMINFO("%s: growing the data device from %llu to %llu blocks",
   3453			       dm_device_name(pool->pool_md),
   3454			       sb_data_size, (unsigned long long)data_size);
   3455		r = dm_pool_resize_data_dev(pool->pmd, data_size);
   3456		if (r) {
   3457			metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
   3458			return r;
   3459		}
   3460
   3461		*need_commit = true;
   3462	}
   3463
   3464	return 0;
   3465}
   3466
   3467static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
   3468{
   3469	int r;
   3470	struct pool_c *pt = ti->private;
   3471	struct pool *pool = pt->pool;
   3472	dm_block_t metadata_dev_size, sb_metadata_dev_size;
   3473
   3474	*need_commit = false;
   3475
   3476	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
   3477
   3478	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
   3479	if (r) {
   3480		DMERR("%s: failed to retrieve metadata device size",
   3481		      dm_device_name(pool->pool_md));
   3482		return r;
   3483	}
   3484
   3485	if (metadata_dev_size < sb_metadata_dev_size) {
   3486		DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
   3487		      dm_device_name(pool->pool_md),
   3488		      metadata_dev_size, sb_metadata_dev_size);
   3489		return -EINVAL;
   3490
   3491	} else if (metadata_dev_size > sb_metadata_dev_size) {
   3492		if (dm_pool_metadata_needs_check(pool->pmd)) {
   3493			DMERR("%s: unable to grow the metadata device until repaired.",
   3494			      dm_device_name(pool->pool_md));
   3495			return 0;
   3496		}
   3497
   3498		warn_if_metadata_device_too_big(pool->md_dev);
   3499		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
   3500		       dm_device_name(pool->pool_md),
   3501		       sb_metadata_dev_size, metadata_dev_size);
   3502
   3503		if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
   3504			set_pool_mode(pool, PM_WRITE);
   3505
   3506		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
   3507		if (r) {
   3508			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
   3509			return r;
   3510		}
   3511
   3512		*need_commit = true;
   3513	}
   3514
   3515	return 0;
   3516}
   3517
   3518/*
   3519 * Retrieves the number of blocks of the data device from
   3520 * the superblock and compares it to the actual device size,
   3521 * thus resizing the data device in case it has grown.
   3522 *
   3523 * This both copes with opening preallocated data devices in the ctr
   3524 * being followed by a resume
   3525 * -and-
   3526 * calling the resume method individually after userspace has
   3527 * grown the data device in reaction to a table event.
   3528 */
   3529static int pool_preresume(struct dm_target *ti)
   3530{
   3531	int r;
   3532	bool need_commit1, need_commit2;
   3533	struct pool_c *pt = ti->private;
   3534	struct pool *pool = pt->pool;
   3535
   3536	/*
   3537	 * Take control of the pool object.
   3538	 */
   3539	r = bind_control_target(pool, ti);
   3540	if (r)
   3541		return r;
   3542
   3543	r = maybe_resize_data_dev(ti, &need_commit1);
   3544	if (r)
   3545		return r;
   3546
   3547	r = maybe_resize_metadata_dev(ti, &need_commit2);
   3548	if (r)
   3549		return r;
   3550
   3551	if (need_commit1 || need_commit2)
   3552		(void) commit(pool);
   3553
   3554	return 0;
   3555}
   3556
   3557static void pool_suspend_active_thins(struct pool *pool)
   3558{
   3559	struct thin_c *tc;
   3560
   3561	/* Suspend all active thin devices */
   3562	tc = get_first_thin(pool);
   3563	while (tc) {
   3564		dm_internal_suspend_noflush(tc->thin_md);
   3565		tc = get_next_thin(pool, tc);
   3566	}
   3567}
   3568
   3569static void pool_resume_active_thins(struct pool *pool)
   3570{
   3571	struct thin_c *tc;
   3572
   3573	/* Resume all active thin devices */
   3574	tc = get_first_thin(pool);
   3575	while (tc) {
   3576		dm_internal_resume(tc->thin_md);
   3577		tc = get_next_thin(pool, tc);
   3578	}
   3579}
   3580
   3581static void pool_resume(struct dm_target *ti)
   3582{
   3583	struct pool_c *pt = ti->private;
   3584	struct pool *pool = pt->pool;
   3585
   3586	/*
   3587	 * Must requeue active_thins' bios and then resume
   3588	 * active_thins _before_ clearing 'suspend' flag.
   3589	 */
   3590	requeue_bios(pool);
   3591	pool_resume_active_thins(pool);
   3592
   3593	spin_lock_irq(&pool->lock);
   3594	pool->low_water_triggered = false;
   3595	pool->suspended = false;
   3596	spin_unlock_irq(&pool->lock);
   3597
   3598	do_waker(&pool->waker.work);
   3599}
   3600
   3601static void pool_presuspend(struct dm_target *ti)
   3602{
   3603	struct pool_c *pt = ti->private;
   3604	struct pool *pool = pt->pool;
   3605
   3606	spin_lock_irq(&pool->lock);
   3607	pool->suspended = true;
   3608	spin_unlock_irq(&pool->lock);
   3609
   3610	pool_suspend_active_thins(pool);
   3611}
   3612
   3613static void pool_presuspend_undo(struct dm_target *ti)
   3614{
   3615	struct pool_c *pt = ti->private;
   3616	struct pool *pool = pt->pool;
   3617
   3618	pool_resume_active_thins(pool);
   3619
   3620	spin_lock_irq(&pool->lock);
   3621	pool->suspended = false;
   3622	spin_unlock_irq(&pool->lock);
   3623}
   3624
   3625static void pool_postsuspend(struct dm_target *ti)
   3626{
   3627	struct pool_c *pt = ti->private;
   3628	struct pool *pool = pt->pool;
   3629
   3630	cancel_delayed_work_sync(&pool->waker);
   3631	cancel_delayed_work_sync(&pool->no_space_timeout);
   3632	flush_workqueue(pool->wq);
   3633	(void) commit(pool);
   3634}
   3635
   3636static int check_arg_count(unsigned argc, unsigned args_required)
   3637{
   3638	if (argc != args_required) {
   3639		DMWARN("Message received with %u arguments instead of %u.",
   3640		       argc, args_required);
   3641		return -EINVAL;
   3642	}
   3643
   3644	return 0;
   3645}
   3646
   3647static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
   3648{
   3649	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
   3650	    *dev_id <= MAX_DEV_ID)
   3651		return 0;
   3652
   3653	if (warning)
   3654		DMWARN("Message received with invalid device id: %s", arg);
   3655
   3656	return -EINVAL;
   3657}
   3658
   3659static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
   3660{
   3661	dm_thin_id dev_id;
   3662	int r;
   3663
   3664	r = check_arg_count(argc, 2);
   3665	if (r)
   3666		return r;
   3667
   3668	r = read_dev_id(argv[1], &dev_id, 1);
   3669	if (r)
   3670		return r;
   3671
   3672	r = dm_pool_create_thin(pool->pmd, dev_id);
   3673	if (r) {
   3674		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
   3675		       argv[1]);
   3676		return r;
   3677	}
   3678
   3679	return 0;
   3680}
   3681
   3682static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
   3683{
   3684	dm_thin_id dev_id;
   3685	dm_thin_id origin_dev_id;
   3686	int r;
   3687
   3688	r = check_arg_count(argc, 3);
   3689	if (r)
   3690		return r;
   3691
   3692	r = read_dev_id(argv[1], &dev_id, 1);
   3693	if (r)
   3694		return r;
   3695
   3696	r = read_dev_id(argv[2], &origin_dev_id, 1);
   3697	if (r)
   3698		return r;
   3699
   3700	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
   3701	if (r) {
   3702		DMWARN("Creation of new snapshot %s of device %s failed.",
   3703		       argv[1], argv[2]);
   3704		return r;
   3705	}
   3706
   3707	return 0;
   3708}
   3709
   3710static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
   3711{
   3712	dm_thin_id dev_id;
   3713	int r;
   3714
   3715	r = check_arg_count(argc, 2);
   3716	if (r)
   3717		return r;
   3718
   3719	r = read_dev_id(argv[1], &dev_id, 1);
   3720	if (r)
   3721		return r;
   3722
   3723	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
   3724	if (r)
   3725		DMWARN("Deletion of thin device %s failed.", argv[1]);
   3726
   3727	return r;
   3728}
   3729
   3730static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
   3731{
   3732	dm_thin_id old_id, new_id;
   3733	int r;
   3734
   3735	r = check_arg_count(argc, 3);
   3736	if (r)
   3737		return r;
   3738
   3739	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
   3740		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
   3741		return -EINVAL;
   3742	}
   3743
   3744	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
   3745		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
   3746		return -EINVAL;
   3747	}
   3748
   3749	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
   3750	if (r) {
   3751		DMWARN("Failed to change transaction id from %s to %s.",
   3752		       argv[1], argv[2]);
   3753		return r;
   3754	}
   3755
   3756	return 0;
   3757}
   3758
   3759static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
   3760{
   3761	int r;
   3762
   3763	r = check_arg_count(argc, 1);
   3764	if (r)
   3765		return r;
   3766
   3767	(void) commit(pool);
   3768
   3769	r = dm_pool_reserve_metadata_snap(pool->pmd);
   3770	if (r)
   3771		DMWARN("reserve_metadata_snap message failed.");
   3772
   3773	return r;
   3774}
   3775
   3776static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
   3777{
   3778	int r;
   3779
   3780	r = check_arg_count(argc, 1);
   3781	if (r)
   3782		return r;
   3783
   3784	r = dm_pool_release_metadata_snap(pool->pmd);
   3785	if (r)
   3786		DMWARN("release_metadata_snap message failed.");
   3787
   3788	return r;
   3789}
   3790
   3791/*
   3792 * Messages supported:
   3793 *   create_thin	<dev_id>
   3794 *   create_snap	<dev_id> <origin_id>
   3795 *   delete		<dev_id>
   3796 *   set_transaction_id <current_trans_id> <new_trans_id>
   3797 *   reserve_metadata_snap
   3798 *   release_metadata_snap
   3799 */
   3800static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
   3801			char *result, unsigned maxlen)
   3802{
   3803	int r = -EINVAL;
   3804	struct pool_c *pt = ti->private;
   3805	struct pool *pool = pt->pool;
   3806
   3807	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
   3808		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
   3809		      dm_device_name(pool->pool_md));
   3810		return -EOPNOTSUPP;
   3811	}
   3812
   3813	if (!strcasecmp(argv[0], "create_thin"))
   3814		r = process_create_thin_mesg(argc, argv, pool);
   3815
   3816	else if (!strcasecmp(argv[0], "create_snap"))
   3817		r = process_create_snap_mesg(argc, argv, pool);
   3818
   3819	else if (!strcasecmp(argv[0], "delete"))
   3820		r = process_delete_mesg(argc, argv, pool);
   3821
   3822	else if (!strcasecmp(argv[0], "set_transaction_id"))
   3823		r = process_set_transaction_id_mesg(argc, argv, pool);
   3824
   3825	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
   3826		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
   3827
   3828	else if (!strcasecmp(argv[0], "release_metadata_snap"))
   3829		r = process_release_metadata_snap_mesg(argc, argv, pool);
   3830
   3831	else
   3832		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
   3833
   3834	if (!r)
   3835		(void) commit(pool);
   3836
   3837	return r;
   3838}
   3839
   3840static void emit_flags(struct pool_features *pf, char *result,
   3841		       unsigned sz, unsigned maxlen)
   3842{
   3843	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
   3844		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
   3845		pf->error_if_no_space;
   3846	DMEMIT("%u ", count);
   3847
   3848	if (!pf->zero_new_blocks)
   3849		DMEMIT("skip_block_zeroing ");
   3850
   3851	if (!pf->discard_enabled)
   3852		DMEMIT("ignore_discard ");
   3853
   3854	if (!pf->discard_passdown)
   3855		DMEMIT("no_discard_passdown ");
   3856
   3857	if (pf->mode == PM_READ_ONLY)
   3858		DMEMIT("read_only ");
   3859
   3860	if (pf->error_if_no_space)
   3861		DMEMIT("error_if_no_space ");
   3862}
   3863
   3864/*
   3865 * Status line is:
   3866 *    <transaction id> <used metadata sectors>/<total metadata sectors>
   3867 *    <used data sectors>/<total data sectors> <held metadata root>
   3868 *    <pool mode> <discard config> <no space config> <needs_check>
   3869 */
   3870static void pool_status(struct dm_target *ti, status_type_t type,
   3871			unsigned status_flags, char *result, unsigned maxlen)
   3872{
   3873	int r;
   3874	unsigned sz = 0;
   3875	uint64_t transaction_id;
   3876	dm_block_t nr_free_blocks_data;
   3877	dm_block_t nr_free_blocks_metadata;
   3878	dm_block_t nr_blocks_data;
   3879	dm_block_t nr_blocks_metadata;
   3880	dm_block_t held_root;
   3881	enum pool_mode mode;
   3882	char buf[BDEVNAME_SIZE];
   3883	char buf2[BDEVNAME_SIZE];
   3884	struct pool_c *pt = ti->private;
   3885	struct pool *pool = pt->pool;
   3886
   3887	switch (type) {
   3888	case STATUSTYPE_INFO:
   3889		if (get_pool_mode(pool) == PM_FAIL) {
   3890			DMEMIT("Fail");
   3891			break;
   3892		}
   3893
   3894		/* Commit to ensure statistics aren't out-of-date */
   3895		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
   3896			(void) commit(pool);
   3897
   3898		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
   3899		if (r) {
   3900			DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
   3901			      dm_device_name(pool->pool_md), r);
   3902			goto err;
   3903		}
   3904
   3905		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
   3906		if (r) {
   3907			DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
   3908			      dm_device_name(pool->pool_md), r);
   3909			goto err;
   3910		}
   3911
   3912		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
   3913		if (r) {
   3914			DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
   3915			      dm_device_name(pool->pool_md), r);
   3916			goto err;
   3917		}
   3918
   3919		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
   3920		if (r) {
   3921			DMERR("%s: dm_pool_get_free_block_count returned %d",
   3922			      dm_device_name(pool->pool_md), r);
   3923			goto err;
   3924		}
   3925
   3926		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
   3927		if (r) {
   3928			DMERR("%s: dm_pool_get_data_dev_size returned %d",
   3929			      dm_device_name(pool->pool_md), r);
   3930			goto err;
   3931		}
   3932
   3933		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
   3934		if (r) {
   3935			DMERR("%s: dm_pool_get_metadata_snap returned %d",
   3936			      dm_device_name(pool->pool_md), r);
   3937			goto err;
   3938		}
   3939
   3940		DMEMIT("%llu %llu/%llu %llu/%llu ",
   3941		       (unsigned long long)transaction_id,
   3942		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
   3943		       (unsigned long long)nr_blocks_metadata,
   3944		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
   3945		       (unsigned long long)nr_blocks_data);
   3946
   3947		if (held_root)
   3948			DMEMIT("%llu ", held_root);
   3949		else
   3950			DMEMIT("- ");
   3951
   3952		mode = get_pool_mode(pool);
   3953		if (mode == PM_OUT_OF_DATA_SPACE)
   3954			DMEMIT("out_of_data_space ");
   3955		else if (is_read_only_pool_mode(mode))
   3956			DMEMIT("ro ");
   3957		else
   3958			DMEMIT("rw ");
   3959
   3960		if (!pool->pf.discard_enabled)
   3961			DMEMIT("ignore_discard ");
   3962		else if (pool->pf.discard_passdown)
   3963			DMEMIT("discard_passdown ");
   3964		else
   3965			DMEMIT("no_discard_passdown ");
   3966
   3967		if (pool->pf.error_if_no_space)
   3968			DMEMIT("error_if_no_space ");
   3969		else
   3970			DMEMIT("queue_if_no_space ");
   3971
   3972		if (dm_pool_metadata_needs_check(pool->pmd))
   3973			DMEMIT("needs_check ");
   3974		else
   3975			DMEMIT("- ");
   3976
   3977		DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
   3978
   3979		break;
   3980
   3981	case STATUSTYPE_TABLE:
   3982		DMEMIT("%s %s %lu %llu ",
   3983		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
   3984		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
   3985		       (unsigned long)pool->sectors_per_block,
   3986		       (unsigned long long)pt->low_water_blocks);
   3987		emit_flags(&pt->requested_pf, result, sz, maxlen);
   3988		break;
   3989
   3990	case STATUSTYPE_IMA:
   3991		*result = '\0';
   3992		break;
   3993	}
   3994	return;
   3995
   3996err:
   3997	DMEMIT("Error");
   3998}
   3999
   4000static int pool_iterate_devices(struct dm_target *ti,
   4001				iterate_devices_callout_fn fn, void *data)
   4002{
   4003	struct pool_c *pt = ti->private;
   4004
   4005	return fn(ti, pt->data_dev, 0, ti->len, data);
   4006}
   4007
   4008static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
   4009{
   4010	struct pool_c *pt = ti->private;
   4011	struct pool *pool = pt->pool;
   4012	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
   4013
   4014	/*
   4015	 * If max_sectors is smaller than pool->sectors_per_block adjust it
   4016	 * to the highest possible power-of-2 factor of pool->sectors_per_block.
   4017	 * This is especially beneficial when the pool's data device is a RAID
   4018	 * device that has a full stripe width that matches pool->sectors_per_block
   4019	 * -- because even though partial RAID stripe-sized IOs will be issued to a
   4020	 *    single RAID stripe; when aggregated they will end on a full RAID stripe
   4021	 *    boundary.. which avoids additional partial RAID stripe writes cascading
   4022	 */
   4023	if (limits->max_sectors < pool->sectors_per_block) {
   4024		while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
   4025			if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
   4026				limits->max_sectors--;
   4027			limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
   4028		}
   4029	}
   4030
   4031	/*
   4032	 * If the system-determined stacked limits are compatible with the
   4033	 * pool's blocksize (io_opt is a factor) do not override them.
   4034	 */
   4035	if (io_opt_sectors < pool->sectors_per_block ||
   4036	    !is_factor(io_opt_sectors, pool->sectors_per_block)) {
   4037		if (is_factor(pool->sectors_per_block, limits->max_sectors))
   4038			blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
   4039		else
   4040			blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
   4041		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
   4042	}
   4043
   4044	/*
   4045	 * pt->adjusted_pf is a staging area for the actual features to use.
   4046	 * They get transferred to the live pool in bind_control_target()
   4047	 * called from pool_preresume().
   4048	 */
   4049	if (!pt->adjusted_pf.discard_enabled) {
   4050		/*
   4051		 * Must explicitly disallow stacking discard limits otherwise the
   4052		 * block layer will stack them if pool's data device has support.
   4053		 */
   4054		limits->discard_granularity = 0;
   4055		return;
   4056	}
   4057
   4058	disable_passdown_if_not_supported(pt);
   4059
   4060	/*
   4061	 * The pool uses the same discard limits as the underlying data
   4062	 * device.  DM core has already set this up.
   4063	 */
   4064}
   4065
   4066static struct target_type pool_target = {
   4067	.name = "thin-pool",
   4068	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
   4069		    DM_TARGET_IMMUTABLE,
   4070	.version = {1, 22, 0},
   4071	.module = THIS_MODULE,
   4072	.ctr = pool_ctr,
   4073	.dtr = pool_dtr,
   4074	.map = pool_map,
   4075	.presuspend = pool_presuspend,
   4076	.presuspend_undo = pool_presuspend_undo,
   4077	.postsuspend = pool_postsuspend,
   4078	.preresume = pool_preresume,
   4079	.resume = pool_resume,
   4080	.message = pool_message,
   4081	.status = pool_status,
   4082	.iterate_devices = pool_iterate_devices,
   4083	.io_hints = pool_io_hints,
   4084};
   4085
   4086/*----------------------------------------------------------------
   4087 * Thin target methods
   4088 *--------------------------------------------------------------*/
   4089static void thin_get(struct thin_c *tc)
   4090{
   4091	refcount_inc(&tc->refcount);
   4092}
   4093
   4094static void thin_put(struct thin_c *tc)
   4095{
   4096	if (refcount_dec_and_test(&tc->refcount))
   4097		complete(&tc->can_destroy);
   4098}
   4099
   4100static void thin_dtr(struct dm_target *ti)
   4101{
   4102	struct thin_c *tc = ti->private;
   4103
   4104	spin_lock_irq(&tc->pool->lock);
   4105	list_del_rcu(&tc->list);
   4106	spin_unlock_irq(&tc->pool->lock);
   4107	synchronize_rcu();
   4108
   4109	thin_put(tc);
   4110	wait_for_completion(&tc->can_destroy);
   4111
   4112	mutex_lock(&dm_thin_pool_table.mutex);
   4113
   4114	__pool_dec(tc->pool);
   4115	dm_pool_close_thin_device(tc->td);
   4116	dm_put_device(ti, tc->pool_dev);
   4117	if (tc->origin_dev)
   4118		dm_put_device(ti, tc->origin_dev);
   4119	kfree(tc);
   4120
   4121	mutex_unlock(&dm_thin_pool_table.mutex);
   4122}
   4123
   4124/*
   4125 * Thin target parameters:
   4126 *
   4127 * <pool_dev> <dev_id> [origin_dev]
   4128 *
   4129 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
   4130 * dev_id: the internal device identifier
   4131 * origin_dev: a device external to the pool that should act as the origin
   4132 *
   4133 * If the pool device has discards disabled, they get disabled for the thin
   4134 * device as well.
   4135 */
   4136static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
   4137{
   4138	int r;
   4139	struct thin_c *tc;
   4140	struct dm_dev *pool_dev, *origin_dev;
   4141	struct mapped_device *pool_md;
   4142
   4143	mutex_lock(&dm_thin_pool_table.mutex);
   4144
   4145	if (argc != 2 && argc != 3) {
   4146		ti->error = "Invalid argument count";
   4147		r = -EINVAL;
   4148		goto out_unlock;
   4149	}
   4150
   4151	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
   4152	if (!tc) {
   4153		ti->error = "Out of memory";
   4154		r = -ENOMEM;
   4155		goto out_unlock;
   4156	}
   4157	tc->thin_md = dm_table_get_md(ti->table);
   4158	spin_lock_init(&tc->lock);
   4159	INIT_LIST_HEAD(&tc->deferred_cells);
   4160	bio_list_init(&tc->deferred_bio_list);
   4161	bio_list_init(&tc->retry_on_resume_list);
   4162	tc->sort_bio_list = RB_ROOT;
   4163
   4164	if (argc == 3) {
   4165		if (!strcmp(argv[0], argv[2])) {
   4166			ti->error = "Error setting origin device";
   4167			r = -EINVAL;
   4168			goto bad_origin_dev;
   4169		}
   4170
   4171		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
   4172		if (r) {
   4173			ti->error = "Error opening origin device";
   4174			goto bad_origin_dev;
   4175		}
   4176		tc->origin_dev = origin_dev;
   4177	}
   4178
   4179	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
   4180	if (r) {
   4181		ti->error = "Error opening pool device";
   4182		goto bad_pool_dev;
   4183	}
   4184	tc->pool_dev = pool_dev;
   4185
   4186	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
   4187		ti->error = "Invalid device id";
   4188		r = -EINVAL;
   4189		goto bad_common;
   4190	}
   4191
   4192	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
   4193	if (!pool_md) {
   4194		ti->error = "Couldn't get pool mapped device";
   4195		r = -EINVAL;
   4196		goto bad_common;
   4197	}
   4198
   4199	tc->pool = __pool_table_lookup(pool_md);
   4200	if (!tc->pool) {
   4201		ti->error = "Couldn't find pool object";
   4202		r = -EINVAL;
   4203		goto bad_pool_lookup;
   4204	}
   4205	__pool_inc(tc->pool);
   4206
   4207	if (get_pool_mode(tc->pool) == PM_FAIL) {
   4208		ti->error = "Couldn't open thin device, Pool is in fail mode";
   4209		r = -EINVAL;
   4210		goto bad_pool;
   4211	}
   4212
   4213	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
   4214	if (r) {
   4215		ti->error = "Couldn't open thin internal device";
   4216		goto bad_pool;
   4217	}
   4218
   4219	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
   4220	if (r)
   4221		goto bad;
   4222
   4223	ti->num_flush_bios = 1;
   4224	ti->flush_supported = true;
   4225	ti->accounts_remapped_io = true;
   4226	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
   4227
   4228	/* In case the pool supports discards, pass them on. */
   4229	if (tc->pool->pf.discard_enabled) {
   4230		ti->discards_supported = true;
   4231		ti->num_discard_bios = 1;
   4232	}
   4233
   4234	mutex_unlock(&dm_thin_pool_table.mutex);
   4235
   4236	spin_lock_irq(&tc->pool->lock);
   4237	if (tc->pool->suspended) {
   4238		spin_unlock_irq(&tc->pool->lock);
   4239		mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
   4240		ti->error = "Unable to activate thin device while pool is suspended";
   4241		r = -EINVAL;
   4242		goto bad;
   4243	}
   4244	refcount_set(&tc->refcount, 1);
   4245	init_completion(&tc->can_destroy);
   4246	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
   4247	spin_unlock_irq(&tc->pool->lock);
   4248	/*
   4249	 * This synchronize_rcu() call is needed here otherwise we risk a
   4250	 * wake_worker() call finding no bios to process (because the newly
   4251	 * added tc isn't yet visible).  So this reduces latency since we
   4252	 * aren't then dependent on the periodic commit to wake_worker().
   4253	 */
   4254	synchronize_rcu();
   4255
   4256	dm_put(pool_md);
   4257
   4258	return 0;
   4259
   4260bad:
   4261	dm_pool_close_thin_device(tc->td);
   4262bad_pool:
   4263	__pool_dec(tc->pool);
   4264bad_pool_lookup:
   4265	dm_put(pool_md);
   4266bad_common:
   4267	dm_put_device(ti, tc->pool_dev);
   4268bad_pool_dev:
   4269	if (tc->origin_dev)
   4270		dm_put_device(ti, tc->origin_dev);
   4271bad_origin_dev:
   4272	kfree(tc);
   4273out_unlock:
   4274	mutex_unlock(&dm_thin_pool_table.mutex);
   4275
   4276	return r;
   4277}
   4278
   4279static int thin_map(struct dm_target *ti, struct bio *bio)
   4280{
   4281	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
   4282
   4283	return thin_bio_map(ti, bio);
   4284}
   4285
   4286static int thin_endio(struct dm_target *ti, struct bio *bio,
   4287		blk_status_t *err)
   4288{
   4289	unsigned long flags;
   4290	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
   4291	struct list_head work;
   4292	struct dm_thin_new_mapping *m, *tmp;
   4293	struct pool *pool = h->tc->pool;
   4294
   4295	if (h->shared_read_entry) {
   4296		INIT_LIST_HEAD(&work);
   4297		dm_deferred_entry_dec(h->shared_read_entry, &work);
   4298
   4299		spin_lock_irqsave(&pool->lock, flags);
   4300		list_for_each_entry_safe(m, tmp, &work, list) {
   4301			list_del(&m->list);
   4302			__complete_mapping_preparation(m);
   4303		}
   4304		spin_unlock_irqrestore(&pool->lock, flags);
   4305	}
   4306
   4307	if (h->all_io_entry) {
   4308		INIT_LIST_HEAD(&work);
   4309		dm_deferred_entry_dec(h->all_io_entry, &work);
   4310		if (!list_empty(&work)) {
   4311			spin_lock_irqsave(&pool->lock, flags);
   4312			list_for_each_entry_safe(m, tmp, &work, list)
   4313				list_add_tail(&m->list, &pool->prepared_discards);
   4314			spin_unlock_irqrestore(&pool->lock, flags);
   4315			wake_worker(pool);
   4316		}
   4317	}
   4318
   4319	if (h->cell)
   4320		cell_defer_no_holder(h->tc, h->cell);
   4321
   4322	return DM_ENDIO_DONE;
   4323}
   4324
   4325static void thin_presuspend(struct dm_target *ti)
   4326{
   4327	struct thin_c *tc = ti->private;
   4328
   4329	if (dm_noflush_suspending(ti))
   4330		noflush_work(tc, do_noflush_start);
   4331}
   4332
   4333static void thin_postsuspend(struct dm_target *ti)
   4334{
   4335	struct thin_c *tc = ti->private;
   4336
   4337	/*
   4338	 * The dm_noflush_suspending flag has been cleared by now, so
   4339	 * unfortunately we must always run this.
   4340	 */
   4341	noflush_work(tc, do_noflush_stop);
   4342}
   4343
   4344static int thin_preresume(struct dm_target *ti)
   4345{
   4346	struct thin_c *tc = ti->private;
   4347
   4348	if (tc->origin_dev)
   4349		tc->origin_size = get_dev_size(tc->origin_dev->bdev);
   4350
   4351	return 0;
   4352}
   4353
   4354/*
   4355 * <nr mapped sectors> <highest mapped sector>
   4356 */
   4357static void thin_status(struct dm_target *ti, status_type_t type,
   4358			unsigned status_flags, char *result, unsigned maxlen)
   4359{
   4360	int r;
   4361	ssize_t sz = 0;
   4362	dm_block_t mapped, highest;
   4363	char buf[BDEVNAME_SIZE];
   4364	struct thin_c *tc = ti->private;
   4365
   4366	if (get_pool_mode(tc->pool) == PM_FAIL) {
   4367		DMEMIT("Fail");
   4368		return;
   4369	}
   4370
   4371	if (!tc->td)
   4372		DMEMIT("-");
   4373	else {
   4374		switch (type) {
   4375		case STATUSTYPE_INFO:
   4376			r = dm_thin_get_mapped_count(tc->td, &mapped);
   4377			if (r) {
   4378				DMERR("dm_thin_get_mapped_count returned %d", r);
   4379				goto err;
   4380			}
   4381
   4382			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
   4383			if (r < 0) {
   4384				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
   4385				goto err;
   4386			}
   4387
   4388			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
   4389			if (r)
   4390				DMEMIT("%llu", ((highest + 1) *
   4391						tc->pool->sectors_per_block) - 1);
   4392			else
   4393				DMEMIT("-");
   4394			break;
   4395
   4396		case STATUSTYPE_TABLE:
   4397			DMEMIT("%s %lu",
   4398			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
   4399			       (unsigned long) tc->dev_id);
   4400			if (tc->origin_dev)
   4401				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
   4402			break;
   4403
   4404		case STATUSTYPE_IMA:
   4405			*result = '\0';
   4406			break;
   4407		}
   4408	}
   4409
   4410	return;
   4411
   4412err:
   4413	DMEMIT("Error");
   4414}
   4415
   4416static int thin_iterate_devices(struct dm_target *ti,
   4417				iterate_devices_callout_fn fn, void *data)
   4418{
   4419	sector_t blocks;
   4420	struct thin_c *tc = ti->private;
   4421	struct pool *pool = tc->pool;
   4422
   4423	/*
   4424	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
   4425	 * we follow a more convoluted path through to the pool's target.
   4426	 */
   4427	if (!pool->ti)
   4428		return 0;	/* nothing is bound */
   4429
   4430	blocks = pool->ti->len;
   4431	(void) sector_div(blocks, pool->sectors_per_block);
   4432	if (blocks)
   4433		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
   4434
   4435	return 0;
   4436}
   4437
   4438static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
   4439{
   4440	struct thin_c *tc = ti->private;
   4441	struct pool *pool = tc->pool;
   4442
   4443	if (!pool->pf.discard_enabled)
   4444		return;
   4445
   4446	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
   4447	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
   4448}
   4449
   4450static struct target_type thin_target = {
   4451	.name = "thin",
   4452	.version = {1, 22, 0},
   4453	.module	= THIS_MODULE,
   4454	.ctr = thin_ctr,
   4455	.dtr = thin_dtr,
   4456	.map = thin_map,
   4457	.end_io = thin_endio,
   4458	.preresume = thin_preresume,
   4459	.presuspend = thin_presuspend,
   4460	.postsuspend = thin_postsuspend,
   4461	.status = thin_status,
   4462	.iterate_devices = thin_iterate_devices,
   4463	.io_hints = thin_io_hints,
   4464};
   4465
   4466/*----------------------------------------------------------------*/
   4467
   4468static int __init dm_thin_init(void)
   4469{
   4470	int r = -ENOMEM;
   4471
   4472	pool_table_init();
   4473
   4474	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
   4475	if (!_new_mapping_cache)
   4476		return r;
   4477
   4478	r = dm_register_target(&thin_target);
   4479	if (r)
   4480		goto bad_new_mapping_cache;
   4481
   4482	r = dm_register_target(&pool_target);
   4483	if (r)
   4484		goto bad_thin_target;
   4485
   4486	return 0;
   4487
   4488bad_thin_target:
   4489	dm_unregister_target(&thin_target);
   4490bad_new_mapping_cache:
   4491	kmem_cache_destroy(_new_mapping_cache);
   4492
   4493	return r;
   4494}
   4495
   4496static void dm_thin_exit(void)
   4497{
   4498	dm_unregister_target(&thin_target);
   4499	dm_unregister_target(&pool_target);
   4500
   4501	kmem_cache_destroy(_new_mapping_cache);
   4502
   4503	pool_table_exit();
   4504}
   4505
   4506module_init(dm_thin_init);
   4507module_exit(dm_thin_exit);
   4508
   4509module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
   4510MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
   4511
   4512MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
   4513MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
   4514MODULE_LICENSE("GPL");